/src/leptonica/src/pdfio2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*====================================================================* |
2 | | - Copyright (C) 2001 Leptonica. All rights reserved. |
3 | | - |
4 | | - Redistribution and use in source and binary forms, with or without |
5 | | - modification, are permitted provided that the following conditions |
6 | | - are met: |
7 | | - 1. Redistributions of source code must retain the above copyright |
8 | | - notice, this list of conditions and the following disclaimer. |
9 | | - 2. Redistributions in binary form must reproduce the above |
10 | | - copyright notice, this list of conditions and the following |
11 | | - disclaimer in the documentation and/or other materials |
12 | | - provided with the distribution. |
13 | | - |
14 | | - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
15 | | - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
16 | | - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
17 | | - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY |
18 | | - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
19 | | - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
20 | | - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
21 | | - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
22 | | - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
23 | | - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
24 | | - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
25 | | *====================================================================*/ |
26 | | |
27 | | /*! |
28 | | * \file pdfio2.c |
29 | | * <pre> |
30 | | * |
31 | | * Lower-level operations for generating pdf. |
32 | | * |
33 | | * Intermediate function for single page, multi-image conversion |
34 | | * l_int32 pixConvertToPdfData() |
35 | | * |
36 | | * Intermediate function for generating multipage pdf output |
37 | | * l_int32 ptraConcatenatePdfToData() |
38 | | * |
39 | | * Convert tiff multipage to pdf file |
40 | | * l_int32 convertTiffMultipageToPdf() |
41 | | * |
42 | | * Generates the CID, transcoding under some conditions |
43 | | * l_int32 l_generateCIDataForPdf() |
44 | | * l_int32 l_generateCIData() |
45 | | * |
46 | | * Lower-level CID generation without transcoding |
47 | | * L_COMP_DATA *l_generateFlateDataPdf() |
48 | | * L_COMP_DATA *l_generateJpegData() |
49 | | * L_COMP_DATA *l_generateJpegDataMem() |
50 | | * static L_COMP_DATA *l_generateJp2kData() |
51 | | * L_COMP_DATA *l_generateG4Data() |
52 | | * |
53 | | * Lower-level CID generation with transcoding |
54 | | * l_int32 pixGenerateCIData() |
55 | | * L_COMP_DATA *l_generateFlateData() |
56 | | * static L_COMP_DATA *pixGenerateFlateData() |
57 | | * static L_COMP_DATA *pixGenerateJpegData() |
58 | | * static L_COMP_DATA *pixGenerateJp2kData() |
59 | | * static L_COMP_DATA *pixGenerateG4Data() |
60 | | * |
61 | | * Other CID operations |
62 | | * l_int32 cidConvertToPdfData() |
63 | | * void l_CIDataDestroy() |
64 | | * |
65 | | * Helper functions for generating the output pdf string |
66 | | * static l_int32 l_generatePdf() |
67 | | * static void generateFixedStringsPdf() |
68 | | * static char *generateEscapeString() |
69 | | * static void generateMediaboxPdf() |
70 | | * static l_int32 generatePageStringPdf() |
71 | | * static l_int32 generateContentStringPdf() |
72 | | * static l_int32 generatePreXStringsPdf() |
73 | | * static l_int32 generateColormapStringsPdf() |
74 | | * static void generateTrailerPdf() |
75 | | * static char *makeTrailerStringPdf() |
76 | | * static l_int32 generateOutputDataPdf() |
77 | | * |
78 | | * Helper functions for generating multipage pdf output |
79 | | * static l_int32 parseTrailerPdf() |
80 | | * static char *generatePagesObjStringPdf() |
81 | | * static L_BYTEA *substituteObjectNumbers() |
82 | | * |
83 | | * Create/destroy/access pdf data |
84 | | * static L_PDF_DATA *pdfdataCreate() |
85 | | * static void pdfdataDestroy() |
86 | | * static L_COMP_DATA *pdfdataGetCid() |
87 | | * |
88 | | * Find number of pages in a pdf |
89 | | * l_int32 getPdfPageCount() |
90 | | * |
91 | | * Find widths and heights of pages and media boxes in a pdf |
92 | | * l_int32 getPdfPageSizes() |
93 | | * l_int32 getPdfMediaBoxSizes() |
94 | | * |
95 | | * Find effective resolution of images rendered from a pdf |
96 | | * l_int32 getPdfRendererResolution() |
97 | | * |
98 | | * Set flags for special modes |
99 | | * void l_pdfSetG4ImageMask() |
100 | | * void l_pdfSetDateAndVersion() |
101 | | * |
102 | | * </pre> |
103 | | */ |
104 | | |
105 | | #ifdef HAVE_CONFIG_H |
106 | | #include <config_auto.h> |
107 | | #endif /* HAVE_CONFIG_H */ |
108 | | |
109 | | #include <string.h> |
110 | | #include <math.h> |
111 | | #include "allheaders.h" |
112 | | |
113 | | /* --------------------------------------------*/ |
114 | | #if USE_PDFIO /* defined in environ.h */ |
115 | | /* --------------------------------------------*/ |
116 | | |
117 | | /* Typical scan resolution in ppi (pixels/inch) */ |
118 | | static const l_int32 DefaultInputRes = 300; |
119 | | |
120 | | /* Static helpers */ |
121 | | static L_COMP_DATA *l_generateJp2kData(const char *fname); |
122 | | static L_COMP_DATA *pixGenerateFlateData(PIX *pixs, l_int32 ascii85flag); |
123 | | static L_COMP_DATA *pixGenerateJpegData(PIX *pixs, l_int32 ascii85flag, |
124 | | l_int32 quality); |
125 | | static L_COMP_DATA *pixGenerateJp2kData(PIX *pixs, l_int32 quality); |
126 | | static L_COMP_DATA *pixGenerateG4Data(PIX *pixs, l_int32 ascii85flag); |
127 | | |
128 | | static l_int32 l_generatePdf(l_uint8 **pdata, size_t *pnbytes, |
129 | | L_PDF_DATA *lpd); |
130 | | static void generateFixedStringsPdf(L_PDF_DATA *lpd); |
131 | | static char *generateEscapeString(const char *str); |
132 | | static void generateMediaboxPdf(L_PDF_DATA *lpd); |
133 | | static l_int32 generatePageStringPdf(L_PDF_DATA *lpd); |
134 | | static l_int32 generateContentStringPdf(L_PDF_DATA *lpd); |
135 | | static l_int32 generatePreXStringsPdf(L_PDF_DATA *lpd); |
136 | | static l_int32 generateColormapStringsPdf(L_PDF_DATA *lpd); |
137 | | static void generateTrailerPdf(L_PDF_DATA *lpd); |
138 | | static char *makeTrailerStringPdf(L_DNA *daloc); |
139 | | static l_int32 generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes, |
140 | | L_PDF_DATA *lpd); |
141 | | |
142 | | static l_int32 parseTrailerPdf(L_BYTEA *bas, L_DNA **pda); |
143 | | static char *generatePagesObjStringPdf(NUMA *napage); |
144 | | static L_BYTEA *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs); |
145 | | |
146 | | static L_PDF_DATA *pdfdataCreate(const char *title); |
147 | | static void pdfdataDestroy(L_PDF_DATA **plpd); |
148 | | static L_COMP_DATA *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index); |
149 | | |
150 | | |
151 | | /* ---------------- Defaults for rendering options ----------------- */ |
152 | | /* Output G4 as writing through image mask; this is the default */ |
153 | | static l_int32 var_WRITE_G4_IMAGE_MASK = 1; |
154 | | /* Write date/time and lib version into pdf; this is the default */ |
155 | | static l_int32 var_WRITE_DATE_AND_VERSION = 1; |
156 | | |
157 | | #define L_SMALLBUF 256 |
158 | | #define L_BIGBUF 2048 /* must be able to hold hex colormap */ |
159 | | |
160 | | |
161 | | #ifndef NO_CONSOLE_IO |
162 | | #define DEBUG_MULTIPAGE 0 |
163 | | #endif /* ~NO_CONSOLE_IO */ |
164 | | |
165 | | |
166 | | /*---------------------------------------------------------------------* |
167 | | * Intermediate function for single page, multi-image conversion * |
168 | | *---------------------------------------------------------------------*/ |
169 | | /*! |
170 | | * \brief pixConvertToPdfData() |
171 | | * |
172 | | * \param[in] pix all depths; cmap OK |
173 | | * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE, |
174 | | * L_JP2K_ENCODE |
175 | | * \param[in] quality for jpeg: 1-100; 0 for default (75) |
176 | | * for jp2k: 27-45; 0 for default (34) |
177 | | * \param[out] pdata pdf array |
178 | | * \param[out] pnbytes number of bytes in pdf array |
179 | | * \param[in] x, y location of lower-left corner of image, in pixels, |
180 | | * relative to the PostScript origin (0,0) at |
181 | | * the lower-left corner of the page) |
182 | | * \param[in] res override the resolution of the input image, in ppi; |
183 | | * use 0 to respect resolution embedded in the input |
184 | | * \param[in] title [optional] pdf title; can be null |
185 | | * \param[in,out] plpd ptr to lpd; created on the first invocation and |
186 | | * returned until last image is processed |
187 | | * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE, |
188 | | * L_LAST_IMAGE |
189 | | * \return 0 if OK, 1 on error |
190 | | * |
191 | | * <pre> |
192 | | * Notes: |
193 | | * (1) If %res == 0 and the input resolution field from the pix is 0, |
194 | | * this will use DefaultInputRes. |
195 | | * (2) This only writes %data if it is the last image to be |
196 | | * written on the page. |
197 | | * (3) See comments in convertToPdf(). |
198 | | * </pre> |
199 | | */ |
200 | | l_ok |
201 | | pixConvertToPdfData(PIX *pix, |
202 | | l_int32 type, |
203 | | l_int32 quality, |
204 | | l_uint8 **pdata, |
205 | | size_t *pnbytes, |
206 | | l_int32 x, |
207 | | l_int32 y, |
208 | | l_int32 res, |
209 | | const char *title, |
210 | | L_PDF_DATA **plpd, |
211 | | l_int32 position) |
212 | 0 | { |
213 | 0 | l_int32 pixres, w, h, ret; |
214 | 0 | l_float32 xpt, ypt, wpt, hpt; |
215 | 0 | L_COMP_DATA *cid = NULL; |
216 | 0 | L_PDF_DATA *lpd = NULL; |
217 | |
|
218 | 0 | if (!pdata) |
219 | 0 | return ERROR_INT("&data not defined", __func__, 1); |
220 | 0 | *pdata = NULL; |
221 | 0 | if (!pnbytes) |
222 | 0 | return ERROR_INT("&nbytes not defined", __func__, 1); |
223 | 0 | *pnbytes = 0; |
224 | 0 | if (!pix) |
225 | 0 | return ERROR_INT("pix not defined", __func__, 1); |
226 | 0 | if (type != L_JPEG_ENCODE && type != L_G4_ENCODE && |
227 | 0 | type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) { |
228 | 0 | selectDefaultPdfEncoding(pix, &type); |
229 | 0 | } |
230 | 0 | if (quality < 0 || quality > 100) |
231 | 0 | return ERROR_INT("invalid quality", __func__, 1); |
232 | | |
233 | 0 | if (plpd) { /* part of multi-page invocation */ |
234 | 0 | if (position == L_FIRST_IMAGE) |
235 | 0 | *plpd = NULL; |
236 | 0 | } |
237 | | |
238 | | /* Generate the compressed image data. It must NOT |
239 | | * be ascii85 encoded. */ |
240 | 0 | pixGenerateCIData(pix, type, quality, 0, &cid); |
241 | 0 | if (!cid) |
242 | 0 | return ERROR_INT("cid not made", __func__, 1); |
243 | | |
244 | | /* Get media box in pts. Guess the input image resolution |
245 | | * based on the input parameter %res, the resolution data in |
246 | | * the pix, and the size of the image. */ |
247 | 0 | pixres = cid->res; |
248 | 0 | w = cid->w; |
249 | 0 | h = cid->h; |
250 | 0 | if (res <= 0.0) |
251 | 0 | res = (pixres > 0) ? pixres : DefaultInputRes; |
252 | 0 | xpt = x * 72.f / res; |
253 | 0 | ypt = y * 72.f / res; |
254 | 0 | wpt = w * 72.f / res; |
255 | 0 | hpt = h * 72.f / res; |
256 | | |
257 | | /* Set up lpd */ |
258 | 0 | if (!plpd) { /* single image */ |
259 | 0 | if ((lpd = pdfdataCreate(title)) == NULL) |
260 | 0 | return ERROR_INT("lpd not made", __func__, 1); |
261 | 0 | } else if (position == L_FIRST_IMAGE) { /* first of multiple images */ |
262 | 0 | if ((lpd = pdfdataCreate(title)) == NULL) |
263 | 0 | return ERROR_INT("lpd not made", __func__, 1); |
264 | 0 | *plpd = lpd; |
265 | 0 | } else { /* not the first of multiple images */ |
266 | 0 | lpd = *plpd; |
267 | 0 | } |
268 | | |
269 | | /* Add the data to the lpd */ |
270 | 0 | ptraAdd(lpd->cida, cid); |
271 | 0 | lpd->n++; |
272 | 0 | ptaAddPt(lpd->xy, xpt, ypt); |
273 | 0 | ptaAddPt(lpd->wh, wpt, hpt); |
274 | | |
275 | | /* If a single image or the last of multiple images, |
276 | | * generate the pdf and destroy the lpd */ |
277 | 0 | if (!plpd || (position == L_LAST_IMAGE)) { |
278 | 0 | ret = l_generatePdf(pdata, pnbytes, lpd); |
279 | 0 | pdfdataDestroy(&lpd); |
280 | 0 | if (plpd) *plpd = NULL; |
281 | 0 | if (ret) |
282 | 0 | return ERROR_INT("pdf output not made", __func__, 1); |
283 | 0 | } |
284 | | |
285 | 0 | return 0; |
286 | 0 | } |
287 | | |
288 | | |
289 | | /*---------------------------------------------------------------------* |
290 | | * Intermediate function for generating multipage pdf output * |
291 | | *---------------------------------------------------------------------*/ |
292 | | /*! |
293 | | * \brief ptraConcatenatePdfToData() |
294 | | * |
295 | | * \param[in] pa_data ptra array of pdf strings, each for a |
296 | | * single-page pdf file |
297 | | * \param[in] sa [optional] string array of pathnames for |
298 | | * input pdf files; can be null |
299 | | * \param[out] pdata concatenated pdf data in memory |
300 | | * \param[out] pnbytes number of bytes in pdf data |
301 | | * \return 0 if OK, 1 on error |
302 | | * |
303 | | * <pre> |
304 | | * Notes: |
305 | | * (1) This only works with leptonica-formatted single-page pdf files. |
306 | | * pdf files generated by other programs will have unpredictable |
307 | | * (and usually bad) results. The requirements for each pdf file: |
308 | | * (a) The Catalog and Info objects are the first two. |
309 | | * (b) Object 3 is Pages |
310 | | * (c) Object 4 is Page |
311 | | * (d) The remaining objects are Contents, XObjects, and ColorSpace |
312 | | * (2) We remove trailers from each page, and append the full trailer |
313 | | * for all pages at the end. |
314 | | * (3) For all but the first file, remove the ID and the first 3 |
315 | | * objects (catalog, info, pages), so that each subsequent |
316 | | * file has only objects of these classes: |
317 | | * Page, Contents, XObject, ColorSpace (Indexed RGB). |
318 | | * For those objects, we substitute these refs to objects |
319 | | * in the local file: |
320 | | * Page: Parent(object 3), Contents, XObject(typically multiple) |
321 | | * XObject: [ColorSpace if indexed] |
322 | | * The Pages object on the first page (object 3) has a Kids array |
323 | | * of references to all the Page objects, with a Count equal |
324 | | * to the number of pages. Each Page object refers back to |
325 | | * this parent. |
326 | | * </pre> |
327 | | */ |
328 | | l_ok |
329 | | ptraConcatenatePdfToData(L_PTRA *pa_data, |
330 | | SARRAY *sa, |
331 | | l_uint8 **pdata, |
332 | | size_t *pnbytes) |
333 | 0 | { |
334 | 0 | char *fname, *str_pages, *str_trailer; |
335 | 0 | l_uint8 *pdfdata, *data; |
336 | 0 | l_int32 i, j, index, nobj, npages; |
337 | 0 | l_int32 *sizes, *locs; |
338 | 0 | size_t size; |
339 | 0 | L_BYTEA *bas, *bad, *bat1, *bat2; |
340 | 0 | L_DNA *da_locs, *da_sizes, *da_outlocs, *da; |
341 | 0 | L_DNAA *daa_locs; /* object locations on each page */ |
342 | 0 | NUMA *na_objs, *napage; |
343 | 0 | NUMAA *naa_objs; /* object mapping numbers to new values */ |
344 | |
|
345 | 0 | if (!pdata) |
346 | 0 | return ERROR_INT("&data not defined", __func__, 1); |
347 | 0 | *pdata = NULL; |
348 | 0 | if (!pnbytes) |
349 | 0 | return ERROR_INT("&nbytes not defined", __func__, 1); |
350 | 0 | *pnbytes = 0; |
351 | 0 | if (!pa_data) |
352 | 0 | return ERROR_INT("pa_data not defined", __func__, 1); |
353 | | |
354 | | /* Parse the files and find the object locations. |
355 | | * Remove file data that cannot be parsed. */ |
356 | 0 | ptraGetActualCount(pa_data, &npages); |
357 | 0 | daa_locs = l_dnaaCreate(npages); |
358 | 0 | for (i = 0; i < npages; i++) { |
359 | 0 | bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i); |
360 | 0 | if (parseTrailerPdf(bas, &da_locs) != 0) { |
361 | 0 | bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION); |
362 | 0 | l_byteaDestroy(&bas); |
363 | 0 | if (sa) { |
364 | 0 | fname = sarrayGetString(sa, i, L_NOCOPY); |
365 | 0 | L_ERROR("can't parse file %s; skipping\n", __func__, fname); |
366 | 0 | } else { |
367 | 0 | L_ERROR("can't parse file %d; skipping\n", __func__, i); |
368 | 0 | } |
369 | 0 | } else { |
370 | 0 | l_dnaaAddDna(daa_locs, da_locs, L_INSERT); |
371 | 0 | } |
372 | 0 | } |
373 | | |
374 | | /* Recompute npages in case some of the files were not pdf */ |
375 | 0 | ptraCompactArray(pa_data); |
376 | 0 | ptraGetActualCount(pa_data, &npages); |
377 | 0 | if (npages == 0) { |
378 | 0 | l_dnaaDestroy(&daa_locs); |
379 | 0 | return ERROR_INT("no parsable pdf files found", __func__, 1); |
380 | 0 | } |
381 | | |
382 | | /* Find the mapping from initial to final object numbers */ |
383 | 0 | naa_objs = numaaCreate(npages); /* stores final object numbers */ |
384 | 0 | napage = numaCreate(npages); /* stores "Page" object numbers */ |
385 | 0 | index = 0; |
386 | 0 | for (i = 0; i < npages; i++) { |
387 | 0 | da = l_dnaaGetDna(daa_locs, i, L_CLONE); |
388 | 0 | nobj = l_dnaGetCount(da); |
389 | 0 | if (i == 0) { |
390 | 0 | numaAddNumber(napage, 4); /* object 4 on first page */ |
391 | 0 | na_objs = numaMakeSequence(0.0, 1.0, nobj - 1); |
392 | 0 | index = nobj - 1; |
393 | 0 | } else { /* skip the first 3 objects in each file */ |
394 | 0 | numaAddNumber(napage, index); /* Page object is first we add */ |
395 | 0 | na_objs = numaMakeConstant(0.0, nobj - 1); |
396 | 0 | numaReplaceNumber(na_objs, 3, 3); /* refers to parent of all */ |
397 | 0 | for (j = 4; j < nobj - 1; j++) |
398 | 0 | numaSetValue(na_objs, j, index++); |
399 | 0 | } |
400 | 0 | numaaAddNuma(naa_objs, na_objs, L_INSERT); |
401 | 0 | l_dnaDestroy(&da); |
402 | 0 | } |
403 | | |
404 | | /* Make the Pages object (#3) */ |
405 | 0 | str_pages = generatePagesObjStringPdf(napage); |
406 | | |
407 | | /* Build the output */ |
408 | 0 | bad = l_byteaCreate(5000); |
409 | 0 | da_outlocs = l_dnaCreate(0); /* locations of all output objects */ |
410 | 0 | for (i = 0; i < npages; i++) { |
411 | 0 | bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i); |
412 | 0 | pdfdata = l_byteaGetData(bas, &size); |
413 | 0 | da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE); /* locs on this page */ |
414 | 0 | na_objs = numaaGetNuma(naa_objs, i, L_CLONE); /* obj # on this page */ |
415 | 0 | nobj = l_dnaGetCount(da_locs) - 1; |
416 | 0 | da_sizes = l_dnaDiffAdjValues(da_locs); /* object sizes on this page */ |
417 | 0 | sizes = l_dnaGetIArray(da_sizes); |
418 | 0 | locs = l_dnaGetIArray(da_locs); |
419 | 0 | if (i == 0) { |
420 | 0 | l_byteaAppendData(bad, pdfdata, sizes[0]); |
421 | 0 | l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]); |
422 | 0 | l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]); |
423 | 0 | l_byteaAppendString(bad, str_pages); |
424 | 0 | for (j = 0; j < 4; j++) |
425 | 0 | l_dnaAddNumber(da_outlocs, locs[j]); |
426 | 0 | } |
427 | 0 | for (j = 4; j < nobj; j++) { |
428 | 0 | l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad)); |
429 | 0 | bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]); |
430 | 0 | bat2 = substituteObjectNumbers(bat1, na_objs); |
431 | 0 | data = l_byteaGetData(bat2, &size); |
432 | 0 | l_byteaAppendData(bad, data, size); |
433 | 0 | l_byteaDestroy(&bat1); |
434 | 0 | l_byteaDestroy(&bat2); |
435 | 0 | } |
436 | 0 | if (i == npages - 1) /* last one */ |
437 | 0 | l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad)); |
438 | 0 | LEPT_FREE(sizes); |
439 | 0 | LEPT_FREE(locs); |
440 | 0 | l_dnaDestroy(&da_locs); |
441 | 0 | numaDestroy(&na_objs); |
442 | 0 | l_dnaDestroy(&da_sizes); |
443 | 0 | } |
444 | | |
445 | | /* Add the trailer */ |
446 | 0 | str_trailer = makeTrailerStringPdf(da_outlocs); |
447 | 0 | l_byteaAppendString(bad, str_trailer); |
448 | | |
449 | | /* Transfer the output data */ |
450 | 0 | *pdata = l_byteaCopyData(bad, pnbytes); |
451 | 0 | l_byteaDestroy(&bad); |
452 | |
|
453 | | #if DEBUG_MULTIPAGE |
454 | | lept_stderr("******** object mapper **********"); |
455 | | numaaWriteStream(stderr, naa_objs); |
456 | | |
457 | | lept_stderr("******** Page object numbers ***********"); |
458 | | numaWriteStderr(napage); |
459 | | |
460 | | lept_stderr("******** Pages object ***********\n"); |
461 | | lept_stderr("%s\n", str_pages); |
462 | | #endif /* DEBUG_MULTIPAGE */ |
463 | |
|
464 | 0 | numaDestroy(&napage); |
465 | 0 | numaaDestroy(&naa_objs); |
466 | 0 | l_dnaDestroy(&da_outlocs); |
467 | 0 | l_dnaaDestroy(&daa_locs); |
468 | 0 | LEPT_FREE(str_pages); |
469 | 0 | LEPT_FREE(str_trailer); |
470 | 0 | return 0; |
471 | 0 | } |
472 | | |
473 | | |
474 | | /*---------------------------------------------------------------------* |
475 | | * Convert tiff multipage to pdf file * |
476 | | *---------------------------------------------------------------------*/ |
477 | | /*! |
478 | | * \brief convertTiffMultipageToPdf() |
479 | | * |
480 | | * \param[in] filein (tiff) |
481 | | * \param[in] fileout (pdf) |
482 | | * \return 0 if OK, 1 on error |
483 | | * |
484 | | * <pre> |
485 | | * Notes: |
486 | | * (1) A multipage tiff file can also be converted to PS, using |
487 | | * convertTiffMultipageToPS() |
488 | | * </pre> |
489 | | */ |
490 | | l_ok |
491 | | convertTiffMultipageToPdf(const char *filein, |
492 | | const char *fileout) |
493 | 0 | { |
494 | 0 | l_int32 istiff; |
495 | 0 | PIXA *pixa; |
496 | 0 | FILE *fp; |
497 | |
|
498 | 0 | if ((fp = fopenReadStream(filein)) == NULL) |
499 | 0 | return ERROR_INT_1("file not found", filein, __func__, 1); |
500 | 0 | istiff = fileFormatIsTiff(fp); |
501 | 0 | fclose(fp); |
502 | 0 | if (!istiff) |
503 | 0 | return ERROR_INT_1("file not tiff format", filein, __func__, 1); |
504 | | |
505 | 0 | pixa = pixaReadMultipageTiff(filein); |
506 | 0 | pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout); |
507 | 0 | pixaDestroy(&pixa); |
508 | 0 | return 0; |
509 | 0 | } |
510 | | |
511 | | |
512 | | /*---------------------------------------------------------------------* |
513 | | * CID-based operations * |
514 | | *---------------------------------------------------------------------*/ |
515 | | /*! |
516 | | * \brief l_generateCIDataForPdf() |
517 | | * |
518 | | * \param[in] fname [optional] can be null |
519 | | * \param[in] pix [optional] can be null |
520 | | * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) |
521 | | * for jp2k if transcoded: 27-45; 0 for default (34) |
522 | | * \param[out] pcid compressed data |
523 | | * \return 0 if OK, 1 on error |
524 | | * |
525 | | * <pre> |
526 | | * Notes: |
527 | | * (1) You must set either filename or pix. |
528 | | * (2) Given an image file and optionally a pix raster of that data, |
529 | | * this provides a CID that is compatible with PDF, preferably |
530 | | * without transcoding. |
531 | | * (3) The pix is included for efficiency, in case transcoding |
532 | | * is required and the pix is available to the caller. |
533 | | * (4) We don't try to open files named "stdin" or "-" for Tesseract |
534 | | * compatibility reasons. We may remove this restriction |
535 | | * in the future. |
536 | | * (5) Note that tiff-g4 must be transcoded to properly handle byte |
537 | | * order and perhaps photometry (e.g., min-is-black). For a |
538 | | * multipage tiff file, data will only be extracted from the |
539 | | * first page, so this should not be invoked. |
540 | | * </pre> |
541 | | */ |
542 | | l_ok |
543 | | l_generateCIDataForPdf(const char *fname, |
544 | | PIX *pix, |
545 | | l_int32 quality, |
546 | | L_COMP_DATA **pcid) |
547 | 0 | { |
548 | 0 | l_int32 format, type; |
549 | 0 | L_COMP_DATA *cid; |
550 | 0 | PIX *pixt; |
551 | |
|
552 | 0 | if (!pcid) |
553 | 0 | return ERROR_INT("&cid not defined", __func__, 1); |
554 | 0 | *pcid = cid = NULL; |
555 | 0 | if (!fname && !pix) |
556 | 0 | return ERROR_INT("neither fname nor pix are defined", __func__, 1); |
557 | | |
558 | | /* If a compressed file is given that is not 'stdin', see if we |
559 | | * can generate the pdf output without transcoding. */ |
560 | 0 | if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) { |
561 | 0 | findFileFormat(fname, &format); |
562 | 0 | if (format == IFF_UNKNOWN) |
563 | 0 | L_WARNING("file %s format is unknown\n", __func__, fname); |
564 | 0 | if (format == IFF_PS || format == IFF_LPDF) { |
565 | 0 | L_ERROR("file %s is unsupported format %d\n", |
566 | 0 | __func__, fname, format); |
567 | 0 | return 1; |
568 | 0 | } |
569 | 0 | if (format == IFF_JFIF_JPEG) { |
570 | 0 | cid = l_generateJpegData(fname, 0); |
571 | 0 | } else if (format == IFF_JP2) { |
572 | 0 | cid = l_generateJp2kData(fname); |
573 | 0 | } else if (format == IFF_PNG) { |
574 | 0 | cid = l_generateFlateDataPdf(fname, pix); |
575 | 0 | } |
576 | 0 | } |
577 | | |
578 | | /* Otherwise, use the pix to generate the pdf output */ |
579 | 0 | if (!cid) { |
580 | 0 | if (!pix) |
581 | 0 | pixt = pixRead(fname); |
582 | 0 | else |
583 | 0 | pixt = pixClone(pix); |
584 | 0 | if (!pixt) |
585 | 0 | return ERROR_INT("pixt not made", __func__, 1); |
586 | 0 | if (selectDefaultPdfEncoding(pixt, &type)) { |
587 | 0 | pixDestroy(&pixt); |
588 | 0 | return 1; |
589 | 0 | } |
590 | 0 | pixGenerateCIData(pixt, type, quality, 0, &cid); |
591 | 0 | pixDestroy(&pixt); |
592 | 0 | if (!cid) |
593 | 0 | return ERROR_INT("cid not made from pix", __func__, 1); |
594 | 0 | } |
595 | 0 | *pcid = cid; |
596 | 0 | return 0; |
597 | 0 | } |
598 | | |
599 | | |
600 | | /*! |
601 | | * \brief l_generateCIData() |
602 | | * |
603 | | * \param[in] fname |
604 | | * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE, |
605 | | * L_JP2K_ENCODE |
606 | | * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) |
607 | | * for jp2k if transcoded: 27-45; 0 for default (34) |
608 | | * \param[in] ascii85 0 for binary; 1 for ascii85-encoded |
609 | | * \param[out] pcid compressed data |
610 | | * \return 0 if OK, 1 on error |
611 | | * |
612 | | * <pre> |
613 | | * Notes: |
614 | | * (1) This can be used for both PostScript and pdf. |
615 | | * (1) Set ascii85: |
616 | | * ~ 0 for binary data (PDF only) |
617 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
618 | | * (2) This attempts to compress according to the requested type. |
619 | | * If this can't be done, it falls back to ordinary flate encoding. |
620 | | * (3) This differs from l_generateCIDataForPdf(), which determines |
621 | | * the file format and only works for pdf. |
622 | | * </pre> |
623 | | */ |
624 | | l_ok |
625 | | l_generateCIData(const char *fname, |
626 | | l_int32 type, |
627 | | l_int32 quality, |
628 | | l_int32 ascii85, |
629 | | L_COMP_DATA **pcid) |
630 | 0 | { |
631 | 0 | l_int32 format, d, bps, spp, iscmap; |
632 | 0 | L_COMP_DATA *cid; |
633 | 0 | PIX *pix; |
634 | |
|
635 | 0 | if (!pcid) |
636 | 0 | return ERROR_INT("&cid not defined", __func__, 1); |
637 | 0 | *pcid = NULL; |
638 | 0 | if (!fname) |
639 | 0 | return ERROR_INT("fname not defined", __func__, 1); |
640 | 0 | if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && |
641 | 0 | type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) |
642 | 0 | return ERROR_INT("invalid conversion type", __func__, 1); |
643 | 0 | if (ascii85 != 0 && ascii85 != 1) |
644 | 0 | return ERROR_INT("invalid ascii85", __func__, 1); |
645 | | |
646 | | /* Sanity check on requested encoding */ |
647 | 0 | pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap); |
648 | 0 | d = bps * spp; |
649 | 0 | if (d == 24) d = 32; |
650 | 0 | if (iscmap && type != L_FLATE_ENCODE) { |
651 | 0 | L_WARNING("pixs has cmap; using flate encoding\n", __func__); |
652 | 0 | type = L_FLATE_ENCODE; |
653 | 0 | } else if (d < 8 && type == L_JPEG_ENCODE) { |
654 | 0 | L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__); |
655 | 0 | type = L_FLATE_ENCODE; |
656 | 0 | } else if (d < 8 && type == L_JP2K_ENCODE) { |
657 | 0 | L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__); |
658 | 0 | type = L_FLATE_ENCODE; |
659 | 0 | } else if (d > 1 && type == L_G4_ENCODE) { |
660 | 0 | L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__); |
661 | 0 | type = L_FLATE_ENCODE; |
662 | 0 | } |
663 | |
|
664 | 0 | if (type == L_JPEG_ENCODE) { |
665 | 0 | if (format == IFF_JFIF_JPEG) { /* do not transcode */ |
666 | 0 | cid = l_generateJpegData(fname, ascii85); |
667 | 0 | } else { |
668 | 0 | if ((pix = pixRead(fname)) == NULL) |
669 | 0 | return ERROR_INT("pix not returned for JPEG", __func__, 1); |
670 | 0 | cid = pixGenerateJpegData(pix, ascii85, quality); |
671 | 0 | pixDestroy(&pix); |
672 | 0 | } |
673 | 0 | if (!cid) |
674 | 0 | return ERROR_INT("jpeg data not made", __func__, 1); |
675 | 0 | } else if (type == L_JP2K_ENCODE) { |
676 | 0 | if (format == IFF_JP2) { /* do not transcode */ |
677 | 0 | cid = l_generateJp2kData(fname); |
678 | 0 | } else { |
679 | 0 | if ((pix = pixRead(fname)) == NULL) |
680 | 0 | return ERROR_INT("pix not returned for JP2K", __func__, 1); |
681 | 0 | cid = pixGenerateJp2kData(pix, quality); |
682 | 0 | pixDestroy(&pix); |
683 | 0 | } |
684 | 0 | if (!cid) |
685 | 0 | return ERROR_INT("jp2k data not made", __func__, 1); |
686 | 0 | } else if (type == L_G4_ENCODE) { |
687 | 0 | if ((pix = pixRead(fname)) == NULL) |
688 | 0 | return ERROR_INT("pix not returned for G4", __func__, 1); |
689 | 0 | cid = pixGenerateG4Data(pix, ascii85); |
690 | 0 | pixDestroy(&pix); |
691 | 0 | if (!cid) |
692 | 0 | return ERROR_INT("g4 data not made", __func__, 1); |
693 | 0 | } else if (type == L_FLATE_ENCODE) { |
694 | 0 | if ((cid = l_generateFlateData(fname, ascii85)) == NULL) |
695 | 0 | return ERROR_INT("flate data not made", __func__, 1); |
696 | 0 | } else { |
697 | 0 | return ERROR_INT("invalid conversion type", __func__, 1); |
698 | 0 | } |
699 | 0 | *pcid = cid; |
700 | |
|
701 | 0 | return 0; |
702 | 0 | } |
703 | | |
704 | | |
705 | | /*---------------------------------------------------------------------* |
706 | | * Low-level CID-based operations * |
707 | | *---------------------------------------------------------------------*/ |
708 | | /*! |
709 | | * \brief l_generateFlateDataPdf() |
710 | | * |
711 | | * \param[in] fname preferably png |
712 | | * \param[in] pixs [optional] can be null |
713 | | * \return cid containing png data, or NULL on error |
714 | | * |
715 | | * <pre> |
716 | | * Notes: |
717 | | * (1) If you hand this a png file, you are going to get |
718 | | * png predictors embedded in the flate data. So it has |
719 | | * come to this. http://xkcd.com/1022/ |
720 | | * (2) Exception: if the png is interlaced or if it is RGBA, |
721 | | * it will be transcoded. |
722 | | * (3) If transcoding is required, this will not have to read from |
723 | | * file if a pix is input. |
724 | | * </pre> |
725 | | */ |
726 | | L_COMP_DATA * |
727 | | l_generateFlateDataPdf(const char *fname, |
728 | | PIX *pixs) |
729 | 0 | { |
730 | 0 | l_uint8 *pngcomp = NULL; /* entire PNG compressed file */ |
731 | 0 | l_uint8 *datacomp = NULL; /* gzipped raster data */ |
732 | 0 | l_uint8 *cmapdata = NULL; /* uncompressed colormap */ |
733 | 0 | char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */ |
734 | 0 | l_uint32 i, j, n; |
735 | 0 | l_int32 format, interlaced; |
736 | 0 | l_int32 ncolors; /* in colormap */ |
737 | 0 | l_int32 bps; /* bits/sample: usually 8 */ |
738 | 0 | l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */ |
739 | 0 | l_int32 w, h, cmapflag; |
740 | 0 | l_int32 xres, yres; |
741 | 0 | size_t nbytescomp = 0, nbytespng = 0; |
742 | 0 | FILE *fp; |
743 | 0 | L_COMP_DATA *cid; |
744 | 0 | PIX *pix; |
745 | 0 | PIXCMAP *cmap = NULL; |
746 | |
|
747 | 0 | if (!fname) |
748 | 0 | return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); |
749 | | |
750 | 0 | findFileFormat(fname, &format); |
751 | 0 | spp = 0; /* init to spp != 4 if not png */ |
752 | 0 | interlaced = 0; /* initialize to no interlacing */ |
753 | 0 | bps = 0; /* initialize to a nonsense value */ |
754 | 0 | if (format == IFF_PNG) { |
755 | 0 | isPngInterlaced(fname, &interlaced); |
756 | 0 | if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL)) |
757 | 0 | return (L_COMP_DATA *)ERROR_PTR("bad png input", __func__, NULL); |
758 | 0 | } |
759 | | |
760 | | /* PDF is capable of inlining some types of PNG files, but not all |
761 | | of them. We need to transcode anything with interlacing, an |
762 | | alpha channel, or 1 bpp (which would otherwise be photo-inverted). |
763 | | |
764 | | Note: any PNG image file with an alpha channel is converted on |
765 | | reading to RGBA (spp == 4). This includes the (gray + alpha) format |
766 | | with spp == 2. Because of the conversion, readHeaderPng() gives |
767 | | spp = 2, whereas pixGetSpp() gives spp = 4 on the converted pix. */ |
768 | 0 | if (format != IFF_PNG || |
769 | 0 | (format == IFF_PNG && (interlaced || bps == 1 || spp == 4 || spp == 2))) |
770 | 0 | { /* lgtm+ analyzer needed the logic expanded */ |
771 | 0 | if (!pixs) |
772 | 0 | pix = pixRead(fname); |
773 | 0 | else |
774 | 0 | pix = pixClone(pixs); |
775 | 0 | if (!pix) |
776 | 0 | return (L_COMP_DATA *)ERROR_PTR("pix not made", __func__, NULL); |
777 | 0 | cid = pixGenerateFlateData(pix, 0); |
778 | 0 | pixDestroy(&pix); |
779 | 0 | return cid; |
780 | 0 | } |
781 | | |
782 | | /* It's png. Generate the pdf data without transcoding. |
783 | | * Implementation by Jeff Breidenbach. |
784 | | * First, read the metadata */ |
785 | 0 | if ((fp = fopenReadStream(fname)) == NULL) |
786 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", |
787 | 0 | fname, __func__, NULL); |
788 | 0 | freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag); |
789 | 0 | fgetPngResolution(fp, &xres, &yres); |
790 | 0 | fclose(fp); |
791 | | |
792 | | /* We get pdf corruption when inlining the data from 16 bpp png. */ |
793 | 0 | if (bps == 16) |
794 | 0 | return l_generateFlateData(fname, 0); |
795 | | |
796 | | /* Read the entire png file */ |
797 | 0 | if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL) |
798 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("unable to read file", |
799 | 0 | fname, __func__, NULL); |
800 | | |
801 | | /* Extract flate data, copying portions of it to memory, including |
802 | | * the predictor information in a byte at the beginning of each |
803 | | * raster line. The flate data makes up the vast majority of |
804 | | * the png file, so after extraction we expect datacomp to |
805 | | * be nearly full (i.e., nbytescomp will be only slightly less |
806 | | * than nbytespng). Also extract the colormap if present. */ |
807 | 0 | if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) { |
808 | 0 | LEPT_FREE(pngcomp); |
809 | 0 | return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory", |
810 | 0 | __func__, NULL); |
811 | 0 | } |
812 | | |
813 | | /* Parse the png file. Each chunk consists of: |
814 | | * length: 4 bytes |
815 | | * name: 4 bytes (e.g., "IDAT") |
816 | | * data: n bytes |
817 | | * CRC: 4 bytes |
818 | | * Start at the beginning of the data section of the first chunk, |
819 | | * byte 16, because the png file begins with 8 bytes of header, |
820 | | * followed by the first 8 bytes of the first chunk |
821 | | * (length and name). On each loop, increment by 12 bytes to |
822 | | * skip over the CRC, length and name of the next chunk. */ |
823 | 0 | for (i = 16; i < nbytespng; i += 12) { /* do each successive chunk */ |
824 | | /* Get the chunk length */ |
825 | 0 | n = pngcomp[i - 8] << 24; |
826 | 0 | n += pngcomp[i - 7] << 16; |
827 | 0 | n += pngcomp[i - 6] << 8; |
828 | 0 | n += pngcomp[i - 5] << 0; |
829 | 0 | if (n >= nbytespng - i) { /* "n + i" can overflow */ |
830 | 0 | LEPT_FREE(pngcomp); |
831 | 0 | LEPT_FREE(datacomp); |
832 | 0 | pixcmapDestroy(&cmap); |
833 | 0 | L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", __func__, |
834 | 0 | i, n, nbytespng); |
835 | 0 | return NULL; |
836 | 0 | } |
837 | | |
838 | | /* Is it a data chunk? */ |
839 | 0 | if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) { |
840 | 0 | memcpy(datacomp + nbytescomp, pngcomp + i, n); |
841 | 0 | nbytescomp += n; |
842 | 0 | } |
843 | | |
844 | | /* Is it a palette chunk? */ |
845 | 0 | if (cmapflag && !cmap && |
846 | 0 | memcmp(pngcomp + i - 4, "PLTE", 4) == 0) { |
847 | 0 | if ((n / 3) > (1 << bps)) { |
848 | 0 | LEPT_FREE(pngcomp); |
849 | 0 | LEPT_FREE(datacomp); |
850 | 0 | pixcmapDestroy(&cmap); |
851 | 0 | L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n", |
852 | 0 | __func__, i, n, (1 << bps)); |
853 | 0 | return NULL; |
854 | 0 | } |
855 | 0 | cmap = pixcmapCreate(bps); |
856 | 0 | for (j = i; j < i + n; j += 3) { |
857 | 0 | pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1], |
858 | 0 | pngcomp[j + 2]); |
859 | 0 | } |
860 | 0 | } |
861 | 0 | i += n; /* move to the end of the data chunk */ |
862 | 0 | } |
863 | 0 | LEPT_FREE(pngcomp); |
864 | |
|
865 | 0 | if (nbytescomp == 0) { |
866 | 0 | LEPT_FREE(datacomp); |
867 | 0 | pixcmapDestroy(&cmap); |
868 | 0 | return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", __func__, NULL); |
869 | 0 | } |
870 | | |
871 | | /* Extract and encode the colormap data as hexascii */ |
872 | 0 | ncolors = 0; |
873 | 0 | if (cmap) { |
874 | 0 | pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata); |
875 | 0 | pixcmapDestroy(&cmap); |
876 | 0 | if (!cmapdata) { |
877 | 0 | LEPT_FREE(datacomp); |
878 | 0 | return (L_COMP_DATA *)ERROR_PTR("cmapdata not made", |
879 | 0 | __func__, NULL); |
880 | 0 | } |
881 | 0 | cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors); |
882 | 0 | LEPT_FREE(cmapdata); |
883 | 0 | } |
884 | | |
885 | | /* Note that this is the only situation where the predictor |
886 | | * field of the CID is set to 1. Adobe's predictor values on |
887 | | * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and |
888 | | * 10-14 for inline predictors, the specifics of which are |
889 | | * ignored by the pdf interpreter, which just needs to know that |
890 | | * the first byte on each compressed scanline is some predictor |
891 | | * whose type can be inferred from the byte itself. */ |
892 | 0 | cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); |
893 | 0 | cid->datacomp = datacomp; |
894 | 0 | cid->type = L_FLATE_ENCODE; |
895 | 0 | cid->cmapdatahex = cmapdatahex; |
896 | 0 | cid->nbytescomp = nbytescomp; |
897 | 0 | cid->ncolors = ncolors; |
898 | 0 | cid->predictor = TRUE; |
899 | 0 | cid->w = w; |
900 | 0 | cid->h = h; |
901 | 0 | cid->bps = bps; |
902 | 0 | cid->spp = spp; |
903 | 0 | cid->res = xres; |
904 | 0 | return cid; |
905 | 0 | } |
906 | | |
907 | | |
908 | | /*! |
909 | | * \brief l_generateJpegData() |
910 | | * |
911 | | * \param[in] fname of jpeg file |
912 | | * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg |
913 | | * \return cid containing jpeg data, or NULL on error |
914 | | * |
915 | | * <pre> |
916 | | * Notes: |
917 | | * (1) Set ascii85flag: |
918 | | * ~ 0 for binary data (PDF only) |
919 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
920 | | * (2) Most of this function is repeated in l_generateJpegMemData(), |
921 | | * which is required in pixacompFastConvertToPdfData(). |
922 | | * </pre> |
923 | | */ |
924 | | L_COMP_DATA * |
925 | | l_generateJpegData(const char *fname, |
926 | | l_int32 ascii85flag) |
927 | 0 | { |
928 | 0 | char *data85 = NULL; /* ascii85 encoded jpeg compressed file */ |
929 | 0 | l_uint8 *data = NULL; |
930 | 0 | l_int32 w, h, xres, yres, bps, spp; |
931 | 0 | size_t nbytes, nbytes85; |
932 | 0 | L_COMP_DATA *cid; |
933 | 0 | FILE *fp; |
934 | |
|
935 | 0 | if (!fname) |
936 | 0 | return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); |
937 | | |
938 | 0 | if (ascii85flag != 0 && ascii85flag != 1) |
939 | 0 | return (L_COMP_DATA *)ERROR_PTR("wrong ascii85flags", __func__, NULL); |
940 | | |
941 | | /* Read the metadata */ |
942 | 0 | if (readHeaderJpeg(fname, &w, &h, &spp, NULL, NULL)) |
943 | 0 | return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL); |
944 | 0 | bps = 8; |
945 | 0 | if ((fp = fopenReadStream(fname)) == NULL) |
946 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", |
947 | 0 | fname, __func__, NULL); |
948 | 0 | fgetJpegResolution(fp, &xres, &yres); |
949 | 0 | fclose(fp); |
950 | | |
951 | | /* Read the entire jpeg file. The returned jpeg data in memory |
952 | | * starts with ffd8 and ends with ffd9 */ |
953 | 0 | if ((data = l_binaryRead(fname, &nbytes)) == NULL) |
954 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("data not extracted", |
955 | 0 | fname, __func__, NULL); |
956 | | |
957 | | /* Optionally, encode the compressed data */ |
958 | 0 | if (ascii85flag == 1) { |
959 | 0 | data85 = encodeAscii85(data, nbytes, &nbytes85); |
960 | 0 | LEPT_FREE(data); |
961 | 0 | if (!data85) |
962 | 0 | return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL); |
963 | 0 | else |
964 | 0 | data85[nbytes85 - 1] = '\0'; /* remove the newline */ |
965 | 0 | } |
966 | | |
967 | 0 | cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); |
968 | 0 | if (ascii85flag == 0) { |
969 | 0 | cid->datacomp = data; |
970 | 0 | } else { /* ascii85 */ |
971 | 0 | cid->data85 = data85; |
972 | 0 | cid->nbytes85 = nbytes85; |
973 | 0 | } |
974 | 0 | cid->type = L_JPEG_ENCODE; |
975 | 0 | cid->nbytescomp = nbytes; |
976 | 0 | cid->w = w; |
977 | 0 | cid->h = h; |
978 | 0 | cid->bps = bps; |
979 | 0 | cid->spp = spp; |
980 | 0 | cid->res = xres; |
981 | 0 | return cid; |
982 | 0 | } |
983 | | |
984 | | |
985 | | /*! |
986 | | * \brief l_generateJpegDataMem() |
987 | | * |
988 | | * \param[in] data of jpeg-encoded file |
989 | | * \param[in] nbytes size of jpeg-encoded file |
990 | | * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg |
991 | | * \return cid containing jpeg data, or NULL on error |
992 | | * |
993 | | * <pre> |
994 | | * Notes: |
995 | | * (1) Set ascii85flag: |
996 | | * ~ 0 for binary data (PDF only) |
997 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
998 | | * </pre> |
999 | | */ |
1000 | | L_COMP_DATA * |
1001 | | l_generateJpegDataMem(l_uint8 *data, |
1002 | | size_t nbytes, |
1003 | | l_int32 ascii85flag) |
1004 | 0 | { |
1005 | 0 | char *data85 = NULL; /* ascii85 encoded jpeg compressed file */ |
1006 | 0 | l_int32 w, h, xres, yres, bps, spp; |
1007 | 0 | size_t nbytes85; |
1008 | 0 | L_COMP_DATA *cid; |
1009 | |
|
1010 | 0 | if (!data) |
1011 | 0 | return (L_COMP_DATA *)ERROR_PTR("data not defined", __func__, NULL); |
1012 | | |
1013 | | /* Read the metadata */ |
1014 | 0 | if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) { |
1015 | 0 | LEPT_FREE(data); |
1016 | 0 | return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL); |
1017 | 0 | } |
1018 | 0 | bps = 8; |
1019 | 0 | readResolutionMemJpeg(data, nbytes, &xres, &yres); |
1020 | | |
1021 | | /* Optionally, encode the compressed data */ |
1022 | 0 | if (ascii85flag == 1) { |
1023 | 0 | data85 = encodeAscii85(data, nbytes, &nbytes85); |
1024 | 0 | LEPT_FREE(data); |
1025 | 0 | if (!data85) |
1026 | 0 | return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL); |
1027 | 0 | else |
1028 | 0 | data85[nbytes85 - 1] = '\0'; /* remove the newline */ |
1029 | 0 | } |
1030 | | |
1031 | 0 | cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); |
1032 | 0 | if (ascii85flag == 0) { |
1033 | 0 | cid->datacomp = data; |
1034 | 0 | } else { /* ascii85 */ |
1035 | 0 | cid->data85 = data85; |
1036 | 0 | cid->nbytes85 = nbytes85; |
1037 | 0 | } |
1038 | 0 | cid->type = L_JPEG_ENCODE; |
1039 | 0 | cid->nbytescomp = nbytes; |
1040 | 0 | cid->w = w; |
1041 | 0 | cid->h = h; |
1042 | 0 | cid->bps = bps; |
1043 | 0 | cid->spp = spp; |
1044 | 0 | cid->res = xres; |
1045 | 0 | return cid; |
1046 | 0 | } |
1047 | | |
1048 | | |
1049 | | /*! |
1050 | | * \brief l_generateJp2kData() |
1051 | | * |
1052 | | * \param[in] fname of jp2k file |
1053 | | * \return cid containing jp2k data, or NULL on error |
1054 | | * |
1055 | | * <pre> |
1056 | | * Notes: |
1057 | | * (1) This is only called after the file is verified to be jp2k. |
1058 | | * </pre> |
1059 | | */ |
1060 | | static L_COMP_DATA * |
1061 | | l_generateJp2kData(const char *fname) |
1062 | 0 | { |
1063 | 0 | l_int32 w, h, bps, spp, xres, yres; |
1064 | 0 | size_t nbytes; |
1065 | 0 | L_COMP_DATA *cid; |
1066 | 0 | FILE *fp; |
1067 | |
|
1068 | 0 | if (!fname) |
1069 | 0 | return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); |
1070 | | |
1071 | 0 | if (readHeaderJp2k(fname, &w, &h, &bps, &spp, NULL)) |
1072 | 0 | return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", __func__, NULL); |
1073 | | |
1074 | | /* The returned jp2k data in memory is the entire jp2k file */ |
1075 | 0 | cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); |
1076 | 0 | if ((cid->datacomp = l_binaryRead(fname, &nbytes)) == NULL) { |
1077 | 0 | l_CIDataDestroy(&cid); |
1078 | 0 | return (L_COMP_DATA *)ERROR_PTR("data not extracted", __func__, NULL); |
1079 | 0 | } |
1080 | | |
1081 | 0 | xres = yres = 0; |
1082 | 0 | if ((fp = fopenReadStream(fname)) != NULL) { |
1083 | 0 | fgetJp2kResolution(fp, &xres, &yres); |
1084 | 0 | fclose(fp); |
1085 | 0 | } |
1086 | 0 | cid->type = L_JP2K_ENCODE; |
1087 | 0 | cid->nbytescomp = nbytes; |
1088 | 0 | cid->w = w; |
1089 | 0 | cid->h = h; |
1090 | 0 | cid->bps = bps; |
1091 | 0 | cid->spp = spp; |
1092 | 0 | cid->res = xres; |
1093 | 0 | return cid; |
1094 | 0 | } |
1095 | | |
1096 | | |
1097 | | /*! |
1098 | | * \brief l_generateG4Data() |
1099 | | * |
1100 | | * \param[in] fname of g4 compressed file |
1101 | | * \param[in] ascii85flag 0 for g4 compressed; 1 for ascii85-encoded g4 |
1102 | | * \return cid g4 compressed image data, or NULL on error |
1103 | | * |
1104 | | * <pre> |
1105 | | * Notes: |
1106 | | * (1) Set ascii85flag: |
1107 | | * ~ 0 for binary data (PDF only) |
1108 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
1109 | | * (2) This does not work for multipage tiff files. |
1110 | | * </pre> |
1111 | | */ |
1112 | | L_COMP_DATA * |
1113 | | l_generateG4Data(const char *fname, |
1114 | | l_int32 ascii85flag) |
1115 | 0 | { |
1116 | 0 | l_uint8 *datacomp = NULL; /* g4 compressed raster data */ |
1117 | 0 | char *data85 = NULL; /* ascii85 encoded g4 compressed data */ |
1118 | 0 | l_int32 w, h, xres, yres, npages; |
1119 | 0 | l_int32 minisblack; /* TRUE or FALSE */ |
1120 | 0 | size_t nbytes85, nbytescomp; |
1121 | 0 | L_COMP_DATA *cid; |
1122 | 0 | FILE *fp; |
1123 | |
|
1124 | 0 | if (!fname) |
1125 | 0 | return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); |
1126 | | |
1127 | | /* Make sure this is a single page tiff file */ |
1128 | 0 | if ((fp = fopenReadStream(fname)) == NULL) |
1129 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", |
1130 | 0 | fname, __func__, NULL); |
1131 | 0 | tiffGetCount(fp, &npages); |
1132 | 0 | fclose(fp); |
1133 | 0 | if (npages != 1) { |
1134 | 0 | L_ERROR(" %d page tiff; only works with 1 page (file: %s)\n", __func__, npages, fname); |
1135 | 0 | return NULL; |
1136 | 0 | } |
1137 | | |
1138 | | /* Read the resolution */ |
1139 | 0 | if ((fp = fopenReadStream(fname)) == NULL) |
1140 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("stream not opened", |
1141 | 0 | fname, __func__, NULL); |
1142 | 0 | getTiffResolution(fp, &xres, &yres); |
1143 | 0 | fclose(fp); |
1144 | | |
1145 | | /* The returned ccitt g4 data in memory is the block of |
1146 | | * bytes in the tiff file, starting after 8 bytes and |
1147 | | * ending before the directory. */ |
1148 | 0 | if (extractG4DataFromFile(fname, &datacomp, &nbytescomp, |
1149 | 0 | &w, &h, &minisblack)) { |
1150 | 0 | return (L_COMP_DATA *)ERROR_PTR_1("datacomp not extracted", |
1151 | 0 | fname, __func__, NULL); |
1152 | 0 | } |
1153 | | |
1154 | | /* Optionally, encode the compressed data */ |
1155 | 0 | if (ascii85flag == 1) { |
1156 | 0 | data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85); |
1157 | 0 | LEPT_FREE(datacomp); |
1158 | 0 | if (!data85) |
1159 | 0 | return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL); |
1160 | 0 | else |
1161 | 0 | data85[nbytes85 - 1] = '\0'; /* remove the newline */ |
1162 | 0 | } |
1163 | | |
1164 | 0 | cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); |
1165 | 0 | if (ascii85flag == 0) { |
1166 | 0 | cid->datacomp = datacomp; |
1167 | 0 | } else { /* ascii85 */ |
1168 | 0 | cid->data85 = data85; |
1169 | 0 | cid->nbytes85 = nbytes85; |
1170 | 0 | } |
1171 | 0 | cid->type = L_G4_ENCODE; |
1172 | 0 | cid->nbytescomp = nbytescomp; |
1173 | 0 | cid->w = w; |
1174 | 0 | cid->h = h; |
1175 | 0 | cid->bps = 1; |
1176 | 0 | cid->spp = 1; |
1177 | 0 | cid->minisblack = minisblack; |
1178 | 0 | cid->res = xres; |
1179 | 0 | return cid; |
1180 | 0 | } |
1181 | | |
1182 | | |
1183 | | /*! |
1184 | | * \brief pixGenerateCIData() |
1185 | | * |
1186 | | * \param[in] pixs 8 or 32 bpp, no colormap |
1187 | | * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or |
1188 | | * L_JP2K_ENCODE |
1189 | | * \param[in] quality for jpeg if transcoded: 1-100; 0 for default (75) |
1190 | | * for jp2k if transcoded: 27-45; 0 for default (34) |
1191 | | * \param[in] ascii85 0 for binary; 1 for ascii85-encoded |
1192 | | * \param[out] pcid compressed data |
1193 | | * \return 0 if OK, 1 on error |
1194 | | * |
1195 | | * <pre> |
1196 | | * Notes: |
1197 | | * (1) Set ascii85: |
1198 | | * ~ 0 for binary data (PDF only) |
1199 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
1200 | | * (2) Do not accept images with an asperity ratio greater than 10. |
1201 | | * </pre> |
1202 | | */ |
1203 | | l_ok |
1204 | | pixGenerateCIData(PIX *pixs, |
1205 | | l_int32 type, |
1206 | | l_int32 quality, |
1207 | | l_int32 ascii85, |
1208 | | L_COMP_DATA **pcid) |
1209 | 0 | { |
1210 | 0 | l_int32 w, h, d, maxAsp; |
1211 | 0 | PIXCMAP *cmap; |
1212 | |
|
1213 | 0 | if (!pcid) |
1214 | 0 | return ERROR_INT("&cid not defined", __func__, 1); |
1215 | 0 | *pcid = NULL; |
1216 | 0 | if (!pixs) |
1217 | 0 | return ERROR_INT("pixs not defined", __func__, 1); |
1218 | 0 | if (type != L_G4_ENCODE && type != L_JPEG_ENCODE && |
1219 | 0 | type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) { |
1220 | 0 | selectDefaultPdfEncoding(pixs, &type); |
1221 | 0 | } |
1222 | 0 | if (ascii85 != 0 && ascii85 != 1) |
1223 | 0 | return ERROR_INT("invalid ascii85", __func__, 1); |
1224 | 0 | pixGetDimensions(pixs, &w, &h, NULL); |
1225 | 0 | if (w == 0 || h == 0) |
1226 | 0 | return ERROR_INT("invalid w or h", __func__, 1); |
1227 | 0 | maxAsp = L_MAX(w / h, h / w); |
1228 | 0 | if (maxAsp > 10) |
1229 | 0 | return ERROR_INT("max asperity > 10", __func__, 1); |
1230 | | |
1231 | | /* Conditionally modify the encoding type if libz is |
1232 | | * available and the requested library is missing. */ |
1233 | 0 | #if defined(HAVE_LIBZ) |
1234 | | # if !defined(HAVE_LIBJPEG) |
1235 | | if (type == L_JPEG_ENCODE) { |
1236 | | L_WARNING("no libjpeg; using flate encoding\n", __func__); |
1237 | | type = L_FLATE_ENCODE; |
1238 | | } |
1239 | | # endif /* !defined(HAVE_LIBJPEG) */ |
1240 | 0 | # if !defined(HAVE_LIBJP2K) |
1241 | 0 | if (type == L_JP2K_ENCODE) { |
1242 | 0 | L_WARNING("no libjp2k; using flate encoding\n", __func__); |
1243 | 0 | type = L_FLATE_ENCODE; |
1244 | 0 | } |
1245 | 0 | # endif /* !defined(HAVE_LIBJP2K) */ |
1246 | | # if !defined(HAVE_LIBTIFF) |
1247 | | if (type == L_G4_ENCODE) { |
1248 | | L_WARNING("no libtiff; using flate encoding\n", __func__); |
1249 | | type = L_FLATE_ENCODE; |
1250 | | } |
1251 | | # endif /* !defined(HAVE_LIBTIFF) */ |
1252 | 0 | #endif /* defined(HAVE_LIBZ) */ |
1253 | | |
1254 | | /* Sanity check on requested encoding */ |
1255 | 0 | d = pixGetDepth(pixs); |
1256 | 0 | cmap = pixGetColormap(pixs); |
1257 | 0 | if (cmap && type != L_FLATE_ENCODE) { |
1258 | 0 | L_WARNING("pixs has cmap; using flate encoding\n", __func__); |
1259 | 0 | type = L_FLATE_ENCODE; |
1260 | 0 | } else if (d < 8 && (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) { |
1261 | 0 | L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__); |
1262 | 0 | type = L_FLATE_ENCODE; |
1263 | 0 | } else if (d > 1 && type == L_G4_ENCODE) { |
1264 | 0 | L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__); |
1265 | 0 | type = L_FLATE_ENCODE; |
1266 | 0 | } |
1267 | |
|
1268 | 0 | if (type == L_JPEG_ENCODE) { |
1269 | 0 | if ((*pcid = pixGenerateJpegData(pixs, ascii85, quality)) == NULL) |
1270 | 0 | return ERROR_INT("jpeg data not made", __func__, 1); |
1271 | 0 | } else if (type == L_JP2K_ENCODE) { |
1272 | 0 | if ((*pcid = pixGenerateJp2kData(pixs, quality)) == NULL) |
1273 | 0 | return ERROR_INT("jp2k data not made", __func__, 1); |
1274 | 0 | } else if (type == L_G4_ENCODE) { |
1275 | 0 | if ((*pcid = pixGenerateG4Data(pixs, ascii85)) == NULL) |
1276 | 0 | return ERROR_INT("g4 data not made", __func__, 1); |
1277 | 0 | } else { /* type == L_FLATE_ENCODE */ |
1278 | 0 | if ((*pcid = pixGenerateFlateData(pixs, ascii85)) == NULL) |
1279 | 0 | return ERROR_INT("flate data not made", __func__, 1); |
1280 | 0 | } |
1281 | 0 | return 0; |
1282 | 0 | } |
1283 | | |
1284 | | |
1285 | | /*! |
1286 | | * \brief l_generateFlateData() |
1287 | | * |
1288 | | * \param[in] fname |
1289 | | * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped |
1290 | | * \return cid flate compressed image data, or NULL on error |
1291 | | * |
1292 | | * <pre> |
1293 | | * Notes: |
1294 | | * (1) The input image is converted to one of these 4 types: |
1295 | | * ~ 1 bpp |
1296 | | * ~ 8 bpp, no colormap |
1297 | | * ~ 8 bpp, colormap |
1298 | | * ~ 32 bpp rgb |
1299 | | * (2) Set ascii85flag: |
1300 | | * ~ 0 for binary data (PDF only) |
1301 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
1302 | | * (3) Always transcodes (i.e., first decodes the png file) |
1303 | | * </pre> |
1304 | | */ |
1305 | | L_COMP_DATA * |
1306 | | l_generateFlateData(const char *fname, |
1307 | | l_int32 ascii85flag) |
1308 | 0 | { |
1309 | 0 | L_COMP_DATA *cid; |
1310 | 0 | PIX *pixs; |
1311 | |
|
1312 | 0 | if (!fname) |
1313 | 0 | return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL); |
1314 | | |
1315 | 0 | if ((pixs = pixRead(fname)) == NULL) |
1316 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not made", __func__, NULL); |
1317 | 0 | cid = pixGenerateFlateData(pixs, ascii85flag); |
1318 | 0 | pixDestroy(&pixs); |
1319 | 0 | return cid; |
1320 | 0 | } |
1321 | | |
1322 | | |
1323 | | /*! |
1324 | | * \brief pixGenerateFlateData() |
1325 | | * |
1326 | | * \param[in] pixs |
1327 | | * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped |
1328 | | * \return cid flate compressed image data, or NULL on error |
1329 | | * |
1330 | | * <pre> |
1331 | | * Notes: |
1332 | | * (1) If called with an RGBA pix (spp == 4), the alpha channel |
1333 | | * will be removed, projecting a white backgrouond through |
1334 | | * any transparency. |
1335 | | * (2) If called with a colormapped pix, any transparency in the |
1336 | | * alpha component in the colormap will be ignored, as it is |
1337 | | * for all leptonica operations on colormapped pix. |
1338 | | * </pre> |
1339 | | */ |
1340 | | static L_COMP_DATA * |
1341 | | pixGenerateFlateData(PIX *pixs, |
1342 | | l_int32 ascii85flag) |
1343 | 0 | { |
1344 | 0 | l_uint8 *data = NULL; /* uncompressed raster data in required format */ |
1345 | 0 | l_uint8 *datacomp = NULL; /* gzipped raster data */ |
1346 | 0 | char *data85 = NULL; /* ascii85 encoded gzipped raster data */ |
1347 | 0 | l_uint8 *cmapdata = NULL; /* uncompressed colormap */ |
1348 | 0 | char *cmapdata85 = NULL; /* ascii85 encoded uncompressed colormap */ |
1349 | 0 | char *cmapdatahex = NULL; /* hex ascii uncompressed colormap */ |
1350 | 0 | l_int32 ncolors; /* in colormap; not used if cmapdata85 is null */ |
1351 | 0 | l_int32 bps; /* bits/sample: usually 8 */ |
1352 | 0 | l_int32 spp; /* samples/pixel: 1-grayscale/cmap); 3-rgb */ |
1353 | 0 | l_int32 w, h, d, cmapflag; |
1354 | 0 | size_t ncmapbytes85 = 0; |
1355 | 0 | size_t nbytes85 = 0; |
1356 | 0 | size_t nbytes, nbytescomp; |
1357 | 0 | L_COMP_DATA *cid; |
1358 | 0 | PIX *pixt; |
1359 | 0 | PIXCMAP *cmap; |
1360 | |
|
1361 | 0 | if (!pixs) |
1362 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); |
1363 | | |
1364 | | /* Convert the image to one of these 4 types: |
1365 | | * 1 bpp |
1366 | | * 8 bpp, no colormap |
1367 | | * 8 bpp, colormap |
1368 | | * 32 bpp rgb */ |
1369 | 0 | pixGetDimensions(pixs, &w, &h, &d); |
1370 | 0 | cmap = pixGetColormap(pixs); |
1371 | 0 | cmapflag = (cmap) ? 1 : 0; |
1372 | 0 | if (d == 2 || d == 4 || d == 16) { |
1373 | 0 | pixt = pixConvertTo8(pixs, cmapflag); |
1374 | 0 | cmap = pixGetColormap(pixt); |
1375 | 0 | d = pixGetDepth(pixt); |
1376 | 0 | } else if (d == 32 && pixGetSpp(pixs) == 4) { /* remove alpha */ |
1377 | 0 | pixt = pixAlphaBlendUniform(pixs, 0xffffff00); |
1378 | 0 | } else { |
1379 | 0 | pixt = pixClone(pixs); |
1380 | 0 | } |
1381 | 0 | if (!pixt) |
1382 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixt not made", __func__, NULL); |
1383 | 0 | spp = (d == 32) ? 3 : 1; |
1384 | 0 | bps = (d == 32) ? 8 : d; |
1385 | | |
1386 | | /* Extract and encode the colormap data as both ascii85 and hexascii */ |
1387 | 0 | ncolors = 0; |
1388 | 0 | if (cmap) { |
1389 | 0 | pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata); |
1390 | 0 | if (!cmapdata) { |
1391 | 0 | pixDestroy(&pixt); |
1392 | 0 | return (L_COMP_DATA *)ERROR_PTR("cmapdata not made", |
1393 | 0 | __func__, NULL); |
1394 | 0 | } |
1395 | | |
1396 | 0 | cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85); |
1397 | 0 | cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors); |
1398 | 0 | LEPT_FREE(cmapdata); |
1399 | 0 | } |
1400 | | |
1401 | | /* Extract and compress the raster data */ |
1402 | 0 | pixGetRasterData(pixt, &data, &nbytes); |
1403 | 0 | pixDestroy(&pixt); |
1404 | 0 | if (!data) { |
1405 | 0 | LEPT_FREE(cmapdata85); |
1406 | 0 | LEPT_FREE(cmapdatahex); |
1407 | 0 | return (L_COMP_DATA *)ERROR_PTR("data not returned", __func__, NULL); |
1408 | 0 | } |
1409 | 0 | datacomp = zlibCompress(data, nbytes, &nbytescomp); |
1410 | 0 | LEPT_FREE(data); |
1411 | 0 | if (!datacomp) { |
1412 | 0 | LEPT_FREE(cmapdata85); |
1413 | 0 | LEPT_FREE(cmapdatahex); |
1414 | 0 | return (L_COMP_DATA *)ERROR_PTR("datacomp not made", __func__, NULL); |
1415 | 0 | } |
1416 | | |
1417 | | /* Optionally, encode the compressed data */ |
1418 | 0 | if (ascii85flag == 1) { |
1419 | 0 | data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85); |
1420 | 0 | LEPT_FREE(datacomp); |
1421 | 0 | if (!data85) { |
1422 | 0 | LEPT_FREE(cmapdata85); |
1423 | 0 | LEPT_FREE(cmapdatahex); |
1424 | 0 | return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL); |
1425 | 0 | } else { |
1426 | 0 | data85[nbytes85 - 1] = '\0'; /* remove the newline */ |
1427 | 0 | } |
1428 | 0 | } |
1429 | | |
1430 | 0 | cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA)); |
1431 | 0 | if (ascii85flag == 0) { |
1432 | 0 | cid->datacomp = datacomp; |
1433 | 0 | } else { /* ascii85 */ |
1434 | 0 | cid->data85 = data85; |
1435 | 0 | cid->nbytes85 = nbytes85; |
1436 | 0 | } |
1437 | 0 | cid->type = L_FLATE_ENCODE; |
1438 | 0 | cid->cmapdatahex = cmapdatahex; |
1439 | 0 | cid->cmapdata85 = cmapdata85; |
1440 | 0 | cid->nbytescomp = nbytescomp; |
1441 | 0 | cid->ncolors = ncolors; |
1442 | 0 | cid->w = w; |
1443 | 0 | cid->h = h; |
1444 | 0 | cid->bps = bps; |
1445 | 0 | cid->spp = spp; |
1446 | 0 | cid->res = pixGetXRes(pixs); |
1447 | 0 | cid->nbytes = nbytes; /* only for debugging */ |
1448 | 0 | return cid; |
1449 | 0 | } |
1450 | | |
1451 | | |
1452 | | /*! |
1453 | | * \brief pixGenerateJpegData() |
1454 | | * |
1455 | | * \param[in] pixs 8, 16 or 32 bpp, no colormap |
1456 | | * \param[in] ascii85flag 0 for jpeg; 1 for ascii85-encoded jpeg |
1457 | | * \param[in] quality 0 for default, which is 75 |
1458 | | * \return cid jpeg compressed data, or NULL on error |
1459 | | * |
1460 | | * <pre> |
1461 | | * Notes: |
1462 | | * (1) Set ascii85flag: |
1463 | | * ~ 0 for binary data (PDF only) |
1464 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
1465 | | * (2) If 16 bpp, convert first to 8 bpp, using the MSB |
1466 | | * </pre> |
1467 | | */ |
1468 | | static L_COMP_DATA * |
1469 | | pixGenerateJpegData(PIX *pixs, |
1470 | | l_int32 ascii85flag, |
1471 | | l_int32 quality) |
1472 | 0 | { |
1473 | 0 | l_int32 d; |
1474 | 0 | char *fname; |
1475 | 0 | L_COMP_DATA *cid; |
1476 | |
|
1477 | 0 | if (!pixs) |
1478 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); |
1479 | 0 | if (pixGetColormap(pixs)) |
1480 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL); |
1481 | 0 | d = pixGetDepth(pixs); |
1482 | 0 | if (d != 8 && d != 16 && d != 32) |
1483 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not 8, 16 or 32 bpp", |
1484 | 0 | __func__, NULL); |
1485 | | |
1486 | | /* Compress to a temp jpeg file */ |
1487 | 0 | fname = l_makeTempFilename(); |
1488 | 0 | if (pixWriteJpeg(fname, pixs, quality, 0)) { |
1489 | 0 | LEPT_FREE(fname); |
1490 | 0 | return NULL; |
1491 | 0 | } |
1492 | | |
1493 | | /* Generate the data */ |
1494 | 0 | cid = l_generateJpegData(fname, ascii85flag); |
1495 | 0 | if (lept_rmfile(fname) != 0) |
1496 | 0 | L_ERROR("temp file %s was not deleted\n", __func__, fname); |
1497 | 0 | LEPT_FREE(fname); |
1498 | 0 | return cid; |
1499 | 0 | } |
1500 | | |
1501 | | |
1502 | | /*! |
1503 | | * \brief pixGenerateJp2kData() |
1504 | | * |
1505 | | * \param[in] pixs 8 or 32 bpp, no colormap |
1506 | | * \param[in] quality 0 for default, which is 34 |
1507 | | * \return cid jp2k compressed data, or NULL on error |
1508 | | * |
1509 | | * <pre> |
1510 | | * Notes: |
1511 | | * (1) The quality can be set between 27 (very poor) and 45 |
1512 | | * (nearly perfect). Use 0 for default (34). Use 100 for lossless, |
1513 | | * but this is very expensive and not recommended. |
1514 | | * </pre> |
1515 | | */ |
1516 | | static L_COMP_DATA * |
1517 | | pixGenerateJp2kData(PIX *pixs, |
1518 | | l_int32 quality) |
1519 | 0 | { |
1520 | 0 | l_int32 d; |
1521 | 0 | char *fname; |
1522 | 0 | L_COMP_DATA *cid; |
1523 | |
|
1524 | 0 | if (!pixs) |
1525 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); |
1526 | 0 | if (pixGetColormap(pixs)) |
1527 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL); |
1528 | 0 | d = pixGetDepth(pixs); |
1529 | 0 | if (d != 8 && d != 32) |
1530 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", __func__, NULL); |
1531 | | |
1532 | | /* Compress to a temp jp2k file */ |
1533 | 0 | fname = l_makeTempFilename(); |
1534 | 0 | if (pixWriteJp2k(fname, pixs, quality, 5, 0, 0)) { |
1535 | 0 | LEPT_FREE(fname); |
1536 | 0 | return NULL; |
1537 | 0 | } |
1538 | | |
1539 | | /* Generate the data */ |
1540 | 0 | cid = l_generateJp2kData(fname); |
1541 | 0 | if (lept_rmfile(fname) != 0) |
1542 | 0 | L_ERROR("temp file %s was not deleted\n", __func__, fname); |
1543 | 0 | LEPT_FREE(fname); |
1544 | 0 | return cid; |
1545 | 0 | } |
1546 | | |
1547 | | |
1548 | | /*! |
1549 | | * \brief pixGenerateG4Data() |
1550 | | * |
1551 | | * \param[in] pixs 1 bpp, no colormap |
1552 | | * \param[in] ascii85flag 0 for gzipped; 1 for ascii85-encoded gzipped |
1553 | | * \return cid g4 compressed image data, or NULL on error |
1554 | | * |
1555 | | * <pre> |
1556 | | * Notes: |
1557 | | * (1) Set ascii85flag: |
1558 | | * ~ 0 for binary data (PDF only) |
1559 | | * ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only) |
1560 | | * </pre> |
1561 | | */ |
1562 | | static L_COMP_DATA * |
1563 | | pixGenerateG4Data(PIX *pixs, |
1564 | | l_int32 ascii85flag) |
1565 | 0 | { |
1566 | 0 | char *fname; |
1567 | 0 | L_COMP_DATA *cid; |
1568 | |
|
1569 | 0 | if (!pixs) |
1570 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL); |
1571 | 0 | if (pixGetDepth(pixs) != 1) |
1572 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", __func__, NULL); |
1573 | 0 | if (pixGetColormap(pixs)) |
1574 | 0 | return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL); |
1575 | | |
1576 | | /* Compress to a temp tiff g4 file */ |
1577 | 0 | fname = l_makeTempFilename(); |
1578 | 0 | if (pixWrite(fname, pixs, IFF_TIFF_G4)) { |
1579 | 0 | LEPT_FREE(fname); |
1580 | 0 | return NULL; |
1581 | 0 | } |
1582 | | |
1583 | 0 | cid = l_generateG4Data(fname, ascii85flag); |
1584 | 0 | if (lept_rmfile(fname) != 0) |
1585 | 0 | L_ERROR("temp file %s was not deleted\n", __func__, fname); |
1586 | 0 | LEPT_FREE(fname); |
1587 | 0 | return cid; |
1588 | 0 | } |
1589 | | |
1590 | | |
1591 | | /*! |
1592 | | * \brief cidConvertToPdfData() |
1593 | | * |
1594 | | * \param[in] cid compressed image data |
1595 | | * \param[in] title [optional] pdf title; can be null |
1596 | | * \param[out] pdata output pdf data for image |
1597 | | * \param[out] pnbytes size of output pdf data |
1598 | | * \return 0 if OK, 1 on error |
1599 | | * |
1600 | | * <pre> |
1601 | | * Notes: |
1602 | | * (1) Caller must not destroy the cid. It is absorbed in the |
1603 | | * lpd and destroyed by this function. |
1604 | | * </pre> |
1605 | | */ |
1606 | | l_ok |
1607 | | cidConvertToPdfData(L_COMP_DATA *cid, |
1608 | | const char *title, |
1609 | | l_uint8 **pdata, |
1610 | | size_t *pnbytes) |
1611 | 0 | { |
1612 | 0 | l_int32 res, ret; |
1613 | 0 | l_float32 wpt, hpt; |
1614 | 0 | L_PDF_DATA *lpd = NULL; |
1615 | |
|
1616 | 0 | if (!pdata || !pnbytes) |
1617 | 0 | return ERROR_INT("&data and &nbytes not both defined", __func__, 1); |
1618 | 0 | *pdata = NULL; |
1619 | 0 | *pnbytes = 0; |
1620 | 0 | if (!cid) |
1621 | 0 | return ERROR_INT("cid not defined", __func__, 1); |
1622 | | |
1623 | | /* Get media box parameters, in pts */ |
1624 | 0 | res = cid->res; |
1625 | 0 | if (res <= 0) |
1626 | 0 | res = DefaultInputRes; |
1627 | 0 | wpt = cid->w * 72.f / res; |
1628 | 0 | hpt = cid->h * 72.f / res; |
1629 | | |
1630 | | /* Set up the pdf data struct (lpd) */ |
1631 | 0 | if ((lpd = pdfdataCreate(title)) == NULL) |
1632 | 0 | return ERROR_INT("lpd not made", __func__, 1); |
1633 | 0 | ptraAdd(lpd->cida, cid); |
1634 | 0 | lpd->n++; |
1635 | 0 | ptaAddPt(lpd->xy, 0, 0); /* xpt = ypt = 0 */ |
1636 | 0 | ptaAddPt(lpd->wh, wpt, hpt); |
1637 | | |
1638 | | /* Generate the pdf string and destroy the lpd */ |
1639 | 0 | ret = l_generatePdf(pdata, pnbytes, lpd); |
1640 | 0 | pdfdataDestroy(&lpd); |
1641 | 0 | if (ret) |
1642 | 0 | return ERROR_INT("pdf output not made", __func__, 1); |
1643 | 0 | return 0; |
1644 | 0 | } |
1645 | | |
1646 | | |
1647 | | /*! |
1648 | | * \brief l_CIDataDestroy() |
1649 | | * |
1650 | | * \param[in,out] pcid will be set to null before returning |
1651 | | * \return void |
1652 | | */ |
1653 | | void |
1654 | | l_CIDataDestroy(L_COMP_DATA **pcid) |
1655 | 0 | { |
1656 | 0 | L_COMP_DATA *cid; |
1657 | |
|
1658 | 0 | if (pcid == NULL) { |
1659 | 0 | L_WARNING("ptr address is null!\n", __func__); |
1660 | 0 | return; |
1661 | 0 | } |
1662 | 0 | if ((cid = *pcid) == NULL) |
1663 | 0 | return; |
1664 | | |
1665 | 0 | if (cid->datacomp) LEPT_FREE(cid->datacomp); |
1666 | 0 | if (cid->data85) LEPT_FREE(cid->data85); |
1667 | 0 | if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85); |
1668 | 0 | if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex); |
1669 | 0 | LEPT_FREE(cid); |
1670 | 0 | *pcid = NULL; |
1671 | 0 | } |
1672 | | |
1673 | | |
1674 | | /*---------------------------------------------------------------------* |
1675 | | * Helper functions for generating the output pdf string * |
1676 | | *---------------------------------------------------------------------*/ |
1677 | | /*! |
1678 | | * \brief l_generatePdf() |
1679 | | * |
1680 | | * \param[out] pdata pdf array |
1681 | | * \param[out] pnbytes number of bytes in pdf array |
1682 | | * \param[in] lpd all the required input image data |
1683 | | * \return 0 if OK, 1 on error |
1684 | | * |
1685 | | * <pre> |
1686 | | * Notes: |
1687 | | * (1) On error, no data is returned. |
1688 | | * (2) The objects are: |
1689 | | * 1: Catalog |
1690 | | * 2: Info |
1691 | | * 3: Pages |
1692 | | * 4: Page |
1693 | | * 5: Contents (rendering command) |
1694 | | * 6 to 6+n-1: n XObjects |
1695 | | * 6+n to 6+n+m-1: m colormaps |
1696 | | * </pre> |
1697 | | */ |
1698 | | static l_int32 |
1699 | | l_generatePdf(l_uint8 **pdata, |
1700 | | size_t *pnbytes, |
1701 | | L_PDF_DATA *lpd) |
1702 | 0 | { |
1703 | 0 | if (!pdata) |
1704 | 0 | return ERROR_INT("&data not defined", __func__, 1); |
1705 | 0 | *pdata = NULL; |
1706 | 0 | if (!pnbytes) |
1707 | 0 | return ERROR_INT("&nbytes not defined", __func__, 1); |
1708 | 0 | *pnbytes = 0; |
1709 | 0 | if (!lpd) |
1710 | 0 | return ERROR_INT("lpd not defined", __func__, 1); |
1711 | | |
1712 | 0 | generateFixedStringsPdf(lpd); |
1713 | 0 | generateMediaboxPdf(lpd); |
1714 | 0 | generatePageStringPdf(lpd); |
1715 | 0 | generateContentStringPdf(lpd); |
1716 | 0 | generatePreXStringsPdf(lpd); |
1717 | 0 | generateColormapStringsPdf(lpd); |
1718 | 0 | generateTrailerPdf(lpd); |
1719 | 0 | return generateOutputDataPdf(pdata, pnbytes, lpd); |
1720 | 0 | } |
1721 | | |
1722 | | |
1723 | | static void |
1724 | | generateFixedStringsPdf(L_PDF_DATA *lpd) |
1725 | 0 | { |
1726 | 0 | char buf[L_SMALLBUF]; |
1727 | 0 | char *version, *datestr; |
1728 | 0 | SARRAY *sa; |
1729 | | |
1730 | | /* Accumulate data for the header and objects 1-3 */ |
1731 | 0 | lpd->id = stringNew("%PDF-1.5\n"); |
1732 | 0 | l_dnaAddNumber(lpd->objsize, strlen(lpd->id)); |
1733 | |
|
1734 | 0 | lpd->obj1 = stringNew("1 0 obj\n" |
1735 | 0 | "<<\n" |
1736 | 0 | "/Type /Catalog\n" |
1737 | 0 | "/Pages 3 0 R\n" |
1738 | 0 | ">>\n" |
1739 | 0 | "endobj\n"); |
1740 | 0 | l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1)); |
1741 | |
|
1742 | 0 | sa = sarrayCreate(0); |
1743 | 0 | sarrayAddString(sa, "2 0 obj\n" |
1744 | 0 | "<<\n", L_COPY); |
1745 | 0 | if (var_WRITE_DATE_AND_VERSION) { |
1746 | 0 | datestr = l_getFormattedDate(); |
1747 | 0 | snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr); |
1748 | 0 | sarrayAddString(sa, buf, L_COPY); |
1749 | 0 | LEPT_FREE(datestr); |
1750 | 0 | version = getLeptonicaVersion(); |
1751 | 0 | snprintf(buf, sizeof(buf), |
1752 | 0 | "/Producer (leptonica: %s)\n", version); |
1753 | 0 | LEPT_FREE(version); |
1754 | 0 | } else { |
1755 | 0 | snprintf(buf, sizeof(buf), "/Producer (leptonica)\n"); |
1756 | 0 | } |
1757 | 0 | sarrayAddString(sa, buf, L_COPY); |
1758 | 0 | if (lpd->title) { |
1759 | 0 | char *hexstr; |
1760 | 0 | if ((hexstr = generateEscapeString(lpd->title)) != NULL) { |
1761 | 0 | snprintf(buf, sizeof(buf), "/Title %s\n", hexstr); |
1762 | 0 | sarrayAddString(sa, buf, L_COPY); |
1763 | 0 | } else { |
1764 | 0 | L_ERROR("title string is not ascii\n", __func__); |
1765 | 0 | } |
1766 | 0 | LEPT_FREE(hexstr); |
1767 | 0 | } |
1768 | 0 | sarrayAddString(sa, ">>\n" |
1769 | 0 | "endobj\n", L_COPY); |
1770 | 0 | lpd->obj2 = sarrayToString(sa, 0); |
1771 | 0 | l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2)); |
1772 | 0 | sarrayDestroy(&sa); |
1773 | |
|
1774 | 0 | lpd->obj3 = stringNew("3 0 obj\n" |
1775 | 0 | "<<\n" |
1776 | 0 | "/Type /Pages\n" |
1777 | 0 | "/Kids [ 4 0 R ]\n" |
1778 | 0 | "/Count 1\n" |
1779 | 0 | ">>\n"); |
1780 | 0 | l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3)); |
1781 | | |
1782 | | /* Do the post-datastream string */ |
1783 | 0 | lpd->poststream = stringNew("\n" |
1784 | 0 | "endstream\n" |
1785 | 0 | "endobj\n"); |
1786 | 0 | } |
1787 | | |
1788 | | |
1789 | | /*! |
1790 | | * \brief generateEscapeString() |
1791 | | * |
1792 | | * \param[in] str input string |
1793 | | * \return hex escape string, or null on error |
1794 | | * |
1795 | | * <pre> |
1796 | | * Notes: |
1797 | | * (1) If the input string is not ascii, returns null. |
1798 | | * (2) This takes an input ascii string and generates a hex |
1799 | | * ascii output string with 4 bytes out for each byte in. |
1800 | | * The feff code at the beginning tells the pdf interpreter |
1801 | | * that the data is to be interpreted as big-endian, 4 bytes |
1802 | | * at a time. For ascii, the first two bytes are 0 and the |
1803 | | * last two bytes are less than 0x80. |
1804 | | * </pre> |
1805 | | */ |
1806 | | static char * |
1807 | | generateEscapeString(const char *str) |
1808 | 0 | { |
1809 | 0 | char smallbuf[8]; |
1810 | 0 | char *buffer; |
1811 | 0 | l_int32 i, nchar, buflen; |
1812 | |
|
1813 | 0 | if (!str) |
1814 | 0 | return (char *)ERROR_PTR("str not defined", __func__, NULL); |
1815 | 0 | nchar = strlen(str); |
1816 | 0 | for (i = 0; i < nchar; i++) { |
1817 | 0 | if (str[i] < 0) |
1818 | 0 | return (char *)ERROR_PTR("str not all ascii", __func__, NULL); |
1819 | 0 | } |
1820 | | |
1821 | 0 | buflen = 4 * nchar + 10; |
1822 | 0 | buffer = (char *)LEPT_CALLOC(buflen, sizeof(char)); |
1823 | 0 | stringCat(buffer, buflen, "<feff"); |
1824 | 0 | for (i = 0; i < nchar; i++) { |
1825 | 0 | snprintf(smallbuf, sizeof(smallbuf), "%04x", str[i]); |
1826 | 0 | stringCat(buffer, buflen, smallbuf); |
1827 | 0 | } |
1828 | 0 | stringCat(buffer, buflen, ">"); |
1829 | 0 | return buffer; |
1830 | 0 | } |
1831 | | |
1832 | | |
1833 | | static void |
1834 | | generateMediaboxPdf(L_PDF_DATA *lpd) |
1835 | 0 | { |
1836 | 0 | l_int32 i; |
1837 | 0 | l_float32 xpt, ypt, wpt, hpt, maxx, maxy; |
1838 | | |
1839 | | /* First get the full extent of all the images. |
1840 | | * This is the mediabox, in pts. */ |
1841 | 0 | maxx = maxy = 0; |
1842 | 0 | for (i = 0; i < lpd->n; i++) { |
1843 | 0 | ptaGetPt(lpd->xy, i, &xpt, &ypt); |
1844 | 0 | ptaGetPt(lpd->wh, i, &wpt, &hpt); |
1845 | 0 | maxx = L_MAX(maxx, xpt + wpt); |
1846 | 0 | maxy = L_MAX(maxy, ypt + hpt); |
1847 | 0 | } |
1848 | |
|
1849 | 0 | lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5), |
1850 | 0 | (l_int32)(maxy + 0.5)); |
1851 | | |
1852 | | /* ypt is in standard image coordinates: the location of |
1853 | | * the UL image corner with respect to the UL media box corner. |
1854 | | * Rewrite each ypt for PostScript coordinates: the location of |
1855 | | * the LL image corner with respect to the LL media box corner. */ |
1856 | 0 | for (i = 0; i < lpd->n; i++) { |
1857 | 0 | ptaGetPt(lpd->xy, i, &xpt, &ypt); |
1858 | 0 | ptaGetPt(lpd->wh, i, &wpt, &hpt); |
1859 | 0 | ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt); |
1860 | 0 | } |
1861 | 0 | } |
1862 | | |
1863 | | |
1864 | | static l_int32 |
1865 | | generatePageStringPdf(L_PDF_DATA *lpd) |
1866 | 0 | { |
1867 | 0 | char *buf; |
1868 | 0 | char *xstr; |
1869 | 0 | l_int32 bufsize, i, wpt, hpt; |
1870 | 0 | SARRAY *sa; |
1871 | | |
1872 | | /* Allocate 1000 bytes for the boilerplate text, and |
1873 | | * 50 bytes for each reference to an image in the |
1874 | | * ProcSet array. */ |
1875 | 0 | bufsize = 1000 + 50 * lpd->n; |
1876 | 0 | if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL) |
1877 | 0 | return ERROR_INT("calloc fail for buf", __func__, 1); |
1878 | | |
1879 | 0 | boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt); |
1880 | 0 | sa = sarrayCreate(lpd->n); |
1881 | 0 | for (i = 0; i < lpd->n; i++) { |
1882 | 0 | snprintf(buf, bufsize, "/Im%d %d 0 R ", i + 1, 6 + i); |
1883 | 0 | sarrayAddString(sa, buf, L_COPY); |
1884 | 0 | } |
1885 | 0 | xstr = sarrayToString(sa, 0); |
1886 | 0 | sarrayDestroy(&sa); |
1887 | 0 | if (!xstr) { |
1888 | 0 | LEPT_FREE(buf); |
1889 | 0 | return ERROR_INT("xstr not made", __func__, 1); |
1890 | 0 | } |
1891 | | |
1892 | 0 | snprintf(buf, bufsize, "4 0 obj\n" |
1893 | 0 | "<<\n" |
1894 | 0 | "/Type /Page\n" |
1895 | 0 | "/Parent 3 0 R\n" |
1896 | 0 | "/MediaBox [%d %d %d %d]\n" |
1897 | 0 | "/Contents 5 0 R\n" |
1898 | 0 | "/Resources\n" |
1899 | 0 | "<<\n" |
1900 | 0 | "/XObject << %s >>\n" |
1901 | 0 | "/ProcSet [ /ImageB /ImageI /ImageC ]\n" |
1902 | 0 | ">>\n" |
1903 | 0 | ">>\n" |
1904 | 0 | "endobj\n", |
1905 | 0 | 0, 0, wpt, hpt, xstr); |
1906 | |
|
1907 | 0 | lpd->obj4 = stringNew(buf); |
1908 | 0 | l_dnaAddNumber(lpd->objsize, strlen(lpd->obj4)); |
1909 | 0 | sarrayDestroy(&sa); |
1910 | 0 | LEPT_FREE(buf); |
1911 | 0 | LEPT_FREE(xstr); |
1912 | 0 | return 0; |
1913 | 0 | } |
1914 | | |
1915 | | |
1916 | | static l_int32 |
1917 | | generateContentStringPdf(L_PDF_DATA *lpd) |
1918 | 0 | { |
1919 | 0 | char *buf; |
1920 | 0 | char *cstr; |
1921 | 0 | l_int32 i, bufsize; |
1922 | 0 | l_float32 xpt, ypt, wpt, hpt; |
1923 | 0 | SARRAY *sa; |
1924 | |
|
1925 | 0 | bufsize = 1000 + 200 * lpd->n; |
1926 | 0 | if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL) |
1927 | 0 | return ERROR_INT("calloc fail for buf", __func__, 1); |
1928 | | |
1929 | 0 | sa = sarrayCreate(lpd->n); |
1930 | 0 | for (i = 0; i < lpd->n; i++) { |
1931 | 0 | ptaGetPt(lpd->xy, i, &xpt, &ypt); |
1932 | 0 | ptaGetPt(lpd->wh, i, &wpt, &hpt); |
1933 | 0 | snprintf(buf, bufsize, |
1934 | 0 | "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n", |
1935 | 0 | wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1); |
1936 | 0 | sarrayAddString(sa, buf, L_COPY); |
1937 | 0 | } |
1938 | 0 | cstr = sarrayToString(sa, 0); |
1939 | 0 | sarrayDestroy(&sa); |
1940 | 0 | if (!cstr) { |
1941 | 0 | LEPT_FREE(buf); |
1942 | 0 | return ERROR_INT("cstr not made", __func__, 1); |
1943 | 0 | } |
1944 | | |
1945 | 0 | snprintf(buf, bufsize, "5 0 obj\n" |
1946 | 0 | "<< /Length %d >>\n" |
1947 | 0 | "stream\n" |
1948 | 0 | "%s" |
1949 | 0 | "endstream\n" |
1950 | 0 | "endobj\n", |
1951 | 0 | (l_int32)strlen(cstr), cstr); |
1952 | |
|
1953 | 0 | lpd->obj5 = stringNew(buf); |
1954 | 0 | l_dnaAddNumber(lpd->objsize, strlen(lpd->obj5)); |
1955 | 0 | sarrayDestroy(&sa); |
1956 | 0 | LEPT_FREE(buf); |
1957 | 0 | LEPT_FREE(cstr); |
1958 | 0 | return 0; |
1959 | 0 | } |
1960 | | |
1961 | | |
1962 | | static l_int32 |
1963 | | generatePreXStringsPdf(L_PDF_DATA *lpd) |
1964 | 0 | { |
1965 | 0 | char buff[256]; |
1966 | 0 | char buf[L_BIGBUF]; |
1967 | 0 | char *cstr, *bstr, *fstr, *pstr, *xstr, *photometry; |
1968 | 0 | l_int32 i, cmindex; |
1969 | 0 | L_COMP_DATA *cid; |
1970 | 0 | SARRAY *sa; |
1971 | |
|
1972 | 0 | sa = lpd->saprex; |
1973 | 0 | cmindex = 6 + lpd->n; /* starting value */ |
1974 | 0 | for (i = 0; i < lpd->n; i++) { |
1975 | 0 | pstr = cstr = NULL; |
1976 | 0 | if ((cid = pdfdataGetCid(lpd, i)) == NULL) |
1977 | 0 | return ERROR_INT("cid not found", __func__, 1); |
1978 | | |
1979 | 0 | if (cid->type == L_G4_ENCODE) { |
1980 | 0 | if (var_WRITE_G4_IMAGE_MASK) { |
1981 | 0 | cstr = stringNew("/ImageMask true\n" |
1982 | 0 | "/ColorSpace /DeviceGray"); |
1983 | 0 | } else { |
1984 | 0 | cstr = stringNew("/ColorSpace /DeviceGray"); |
1985 | 0 | } |
1986 | 0 | bstr = stringNew("/BitsPerComponent 1\n" |
1987 | 0 | "/Interpolate true"); |
1988 | | /* Note: the reversal is deliberate. The BlackIs1 flag |
1989 | | * is misleadingly named: it says whether to invert the |
1990 | | * image on decoding because the black pixels are 0, |
1991 | | * not whether the black pixels are 1! The default for |
1992 | | * BlackIs1 is "false", which means "don't invert because |
1993 | | * black is 1." Yikes. */ |
1994 | 0 | photometry = (cid->minisblack) ? stringNew("true") |
1995 | 0 | : stringNew("false"); |
1996 | 0 | snprintf(buff, sizeof(buff), |
1997 | 0 | "/Filter /CCITTFaxDecode\n" |
1998 | 0 | "/DecodeParms\n" |
1999 | 0 | "<<\n" |
2000 | 0 | "/BlackIs1 %s\n" |
2001 | 0 | "/K -1\n" |
2002 | 0 | "/Columns %d\n" |
2003 | 0 | ">>", photometry, cid->w); |
2004 | 0 | fstr = stringNew(buff); |
2005 | 0 | LEPT_FREE(photometry); |
2006 | 0 | } else if (cid->type == L_JPEG_ENCODE) { |
2007 | 0 | if (cid->spp == 1) |
2008 | 0 | cstr = stringNew("/ColorSpace /DeviceGray"); |
2009 | 0 | else if (cid->spp == 3) |
2010 | 0 | cstr = stringNew("/ColorSpace /DeviceRGB"); |
2011 | 0 | else if (cid->spp == 4) /* pdf supports cmyk */ |
2012 | 0 | cstr = stringNew("/ColorSpace /DeviceCMYK"); |
2013 | 0 | else |
2014 | 0 | L_ERROR("in jpeg: spp != 1, 3 or 4\n", __func__); |
2015 | 0 | bstr = stringNew("/BitsPerComponent 8"); |
2016 | 0 | fstr = stringNew("/Filter /DCTDecode"); |
2017 | 0 | } else if (cid->type == L_JP2K_ENCODE) { |
2018 | 0 | if (cid->spp == 1) |
2019 | 0 | cstr = stringNew("/ColorSpace /DeviceGray"); |
2020 | 0 | else if (cid->spp == 3) |
2021 | 0 | cstr = stringNew("/ColorSpace /DeviceRGB"); |
2022 | 0 | else |
2023 | 0 | L_ERROR("in jp2k: spp != 1 && spp != 3\n", __func__); |
2024 | 0 | bstr = stringNew("/BitsPerComponent 8"); |
2025 | 0 | fstr = stringNew("/Filter /JPXDecode"); |
2026 | 0 | } else { /* type == L_FLATE_ENCODE */ |
2027 | 0 | if (cid->ncolors > 0) { /* cmapped */ |
2028 | 0 | snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++); |
2029 | 0 | cstr = stringNew(buff); |
2030 | 0 | } else { |
2031 | 0 | if (cid->spp == 1 && cid->bps == 1) |
2032 | 0 | cstr = stringNew("/ColorSpace /DeviceGray\n" |
2033 | 0 | "/Decode [1 0]"); |
2034 | 0 | else if (cid->spp == 1) /* 8 bpp */ |
2035 | 0 | cstr = stringNew("/ColorSpace /DeviceGray"); |
2036 | 0 | else if (cid->spp == 3) |
2037 | 0 | cstr = stringNew("/ColorSpace /DeviceRGB"); |
2038 | 0 | else |
2039 | 0 | L_ERROR("unknown colorspace: spp = %d\n", |
2040 | 0 | __func__, cid->spp); |
2041 | 0 | } |
2042 | 0 | snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps); |
2043 | 0 | bstr = stringNew(buff); |
2044 | 0 | fstr = stringNew("/Filter /FlateDecode"); |
2045 | 0 | if (cid->predictor == TRUE) { |
2046 | 0 | snprintf(buff, sizeof(buff), |
2047 | 0 | "/DecodeParms\n" |
2048 | 0 | "<<\n" |
2049 | 0 | " /Columns %d\n" |
2050 | 0 | " /Predictor 14\n" |
2051 | 0 | " /Colors %d\n" |
2052 | 0 | " /BitsPerComponent %d\n" |
2053 | 0 | ">>\n", cid->w, cid->spp, cid->bps); |
2054 | 0 | pstr = stringNew(buff); |
2055 | 0 | } |
2056 | 0 | } |
2057 | 0 | if (!pstr) /* no decode parameters */ |
2058 | 0 | pstr = stringNew(""); |
2059 | |
|
2060 | 0 | snprintf(buf, sizeof(buf), |
2061 | 0 | "%d 0 obj\n" |
2062 | 0 | "<<\n" |
2063 | 0 | "/Length %zu\n" |
2064 | 0 | "/Subtype /Image\n" |
2065 | 0 | "%s\n" /* colorspace */ |
2066 | 0 | "/Width %d\n" |
2067 | 0 | "/Height %d\n" |
2068 | 0 | "%s\n" /* bits/component */ |
2069 | 0 | "%s\n" /* filter */ |
2070 | 0 | "%s" /* decode parms; can be empty */ |
2071 | 0 | ">>\n" |
2072 | 0 | "stream\n", |
2073 | 0 | 6 + i, cid->nbytescomp, cstr, |
2074 | 0 | cid->w, cid->h, bstr, fstr, pstr); |
2075 | 0 | xstr = stringNew(buf); |
2076 | 0 | sarrayAddString(sa, xstr, L_INSERT); |
2077 | 0 | l_dnaAddNumber(lpd->objsize, |
2078 | 0 | strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream)); |
2079 | 0 | LEPT_FREE(cstr); |
2080 | 0 | LEPT_FREE(bstr); |
2081 | 0 | LEPT_FREE(fstr); |
2082 | 0 | LEPT_FREE(pstr); |
2083 | 0 | } |
2084 | | |
2085 | 0 | return 0; |
2086 | 0 | } |
2087 | | |
2088 | | |
2089 | | static l_int32 |
2090 | | generateColormapStringsPdf(L_PDF_DATA *lpd) |
2091 | 0 | { |
2092 | 0 | char buf[L_BIGBUF]; |
2093 | 0 | char *cmstr; |
2094 | 0 | l_int32 i, cmindex, ncmap; |
2095 | 0 | L_COMP_DATA *cid; |
2096 | 0 | SARRAY *sa; |
2097 | | |
2098 | | /* In our canonical format, we have 5 objects, followed |
2099 | | * by n XObjects, followed by m colormaps, so the index of |
2100 | | * the first colormap object is 6 + n. */ |
2101 | 0 | sa = lpd->sacmap; |
2102 | 0 | cmindex = 6 + lpd->n; /* starting value */ |
2103 | 0 | ncmap = 0; |
2104 | 0 | for (i = 0; i < lpd->n; i++) { |
2105 | 0 | if ((cid = pdfdataGetCid(lpd, i)) == NULL) |
2106 | 0 | return ERROR_INT("cid not found", __func__, 1); |
2107 | 0 | if (cid->ncolors == 0) continue; |
2108 | | |
2109 | 0 | ncmap++; |
2110 | 0 | snprintf(buf, sizeof(buf), "%d 0 obj\n" |
2111 | 0 | "[ /Indexed /DeviceRGB\n" |
2112 | 0 | "%d\n" |
2113 | 0 | "%s\n" |
2114 | 0 | "]\n" |
2115 | 0 | "endobj\n", |
2116 | 0 | cmindex, cid->ncolors - 1, cid->cmapdatahex); |
2117 | 0 | cmindex++; |
2118 | 0 | cmstr = stringNew(buf); |
2119 | 0 | l_dnaAddNumber(lpd->objsize, strlen(cmstr)); |
2120 | 0 | sarrayAddString(sa, cmstr, L_INSERT); |
2121 | 0 | } |
2122 | | |
2123 | 0 | lpd->ncmap = ncmap; |
2124 | 0 | return 0; |
2125 | 0 | } |
2126 | | |
2127 | | |
2128 | | static void |
2129 | | generateTrailerPdf(L_PDF_DATA *lpd) |
2130 | 0 | { |
2131 | 0 | l_int32 i, n, size, linestart; |
2132 | 0 | L_DNA *daloc, *dasize; |
2133 | | |
2134 | | /* Let nobj be the number of numbered objects. These numbered |
2135 | | * objects are indexed by their pdf number in arrays naloc[] |
2136 | | * and nasize[]. The 0th object is the 9 byte header. Then |
2137 | | * the number of objects in nasize, which includes the header, |
2138 | | * is n = nobj + 1. The array naloc[] has n + 1 elements, |
2139 | | * because it includes as the last element the starting |
2140 | | * location of xref. The indexing of these objects, their |
2141 | | * starting locations and sizes are: |
2142 | | * |
2143 | | * Object number Starting location Size |
2144 | | * ------------- ----------------- -------------- |
2145 | | * 0 daloc[0] = 0 dasize[0] = 9 |
2146 | | * 1 daloc[1] = 9 dasize[1] = 49 |
2147 | | * n daloc[n] dasize[n] |
2148 | | * xref daloc[n+1] |
2149 | | * |
2150 | | * We first generate daloc. |
2151 | | */ |
2152 | 0 | dasize = lpd->objsize; |
2153 | 0 | daloc = lpd->objloc; |
2154 | 0 | linestart = 0; |
2155 | 0 | l_dnaAddNumber(daloc, linestart); /* header */ |
2156 | 0 | n = l_dnaGetCount(dasize); |
2157 | 0 | for (i = 0; i < n; i++) { |
2158 | 0 | l_dnaGetIValue(dasize, i, &size); |
2159 | 0 | linestart += size; |
2160 | 0 | l_dnaAddNumber(daloc, linestart); |
2161 | 0 | } |
2162 | 0 | l_dnaGetIValue(daloc, n, &lpd->xrefloc); /* save it */ |
2163 | | |
2164 | | /* Now make the actual trailer string */ |
2165 | 0 | lpd->trailer = makeTrailerStringPdf(daloc); |
2166 | 0 | } |
2167 | | |
2168 | | |
2169 | | static char * |
2170 | | makeTrailerStringPdf(L_DNA *daloc) |
2171 | 0 | { |
2172 | 0 | char *outstr; |
2173 | 0 | char buf[L_BIGBUF]; |
2174 | 0 | l_int32 i, n, linestart, xrefloc; |
2175 | 0 | SARRAY *sa; |
2176 | |
|
2177 | 0 | if (!daloc) |
2178 | 0 | return (char *)ERROR_PTR("daloc not defined", __func__, NULL); |
2179 | 0 | n = l_dnaGetCount(daloc) - 1; /* numbered objects + 1 (yes, +1) */ |
2180 | |
|
2181 | 0 | sa = sarrayCreate(0); |
2182 | 0 | snprintf(buf, sizeof(buf), "xref\n" |
2183 | 0 | "0 %d\n" |
2184 | 0 | "0000000000 65535 f \n", n); |
2185 | 0 | sarrayAddString(sa, buf, L_COPY); |
2186 | 0 | for (i = 1; i < n; i++) { |
2187 | 0 | l_dnaGetIValue(daloc, i, &linestart); |
2188 | 0 | snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart); |
2189 | 0 | sarrayAddString(sa, buf, L_COPY); |
2190 | 0 | } |
2191 | |
|
2192 | 0 | l_dnaGetIValue(daloc, n, &xrefloc); |
2193 | 0 | snprintf(buf, sizeof(buf), "trailer\n" |
2194 | 0 | "<<\n" |
2195 | 0 | "/Size %d\n" |
2196 | 0 | "/Root 1 0 R\n" |
2197 | 0 | "/Info 2 0 R\n" |
2198 | 0 | ">>\n" |
2199 | 0 | "startxref\n" |
2200 | 0 | "%d\n" |
2201 | 0 | "%%%%EOF\n", n, xrefloc); |
2202 | 0 | sarrayAddString(sa, buf, L_COPY); |
2203 | 0 | outstr = sarrayToString(sa, 0); |
2204 | 0 | sarrayDestroy(&sa); |
2205 | 0 | return outstr; |
2206 | 0 | } |
2207 | | |
2208 | | |
2209 | | /*! |
2210 | | * \brief generateOutputDataPdf() |
2211 | | * |
2212 | | * \param[out] pdata pdf data array |
2213 | | * \param[out] pnbytes size of pdf data array |
2214 | | * \param[in] lpd input data used to make pdf |
2215 | | * \return 0 if OK, 1 on error |
2216 | | * |
2217 | | * <pre> |
2218 | | * Notes: |
2219 | | * (1) Only called from l_generatePdf(). On error, no data is returned. |
2220 | | * </pre> |
2221 | | */ |
2222 | | static l_int32 |
2223 | | generateOutputDataPdf(l_uint8 **pdata, |
2224 | | size_t *pnbytes, |
2225 | | L_PDF_DATA *lpd) |
2226 | 0 | { |
2227 | 0 | char *str; |
2228 | 0 | l_uint8 *data; |
2229 | 0 | l_int32 nimages, i, len; |
2230 | 0 | l_int32 *sizes, *locs; |
2231 | 0 | size_t nbytes; |
2232 | 0 | L_COMP_DATA *cid; |
2233 | |
|
2234 | 0 | if (!pdata) |
2235 | 0 | return ERROR_INT("&data not defined", __func__, 1); |
2236 | 0 | *pdata = NULL; |
2237 | 0 | if (!pnbytes) |
2238 | 0 | return ERROR_INT("&nbytes not defined", __func__, 1); |
2239 | 0 | nbytes = lpd->xrefloc + strlen(lpd->trailer); |
2240 | 0 | *pnbytes = nbytes; |
2241 | 0 | if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL) |
2242 | 0 | return ERROR_INT("calloc fail for data", __func__, 1); |
2243 | 0 | *pdata = data; |
2244 | |
|
2245 | 0 | sizes = l_dnaGetIArray(lpd->objsize); |
2246 | 0 | locs = l_dnaGetIArray(lpd->objloc); |
2247 | 0 | memcpy(data, lpd->id, sizes[0]); |
2248 | 0 | memcpy(data + locs[1], lpd->obj1, sizes[1]); |
2249 | 0 | memcpy(data + locs[2], lpd->obj2, sizes[2]); |
2250 | 0 | memcpy(data + locs[3], lpd->obj3, sizes[3]); |
2251 | 0 | memcpy(data + locs[4], lpd->obj4, sizes[4]); |
2252 | 0 | memcpy(data + locs[5], lpd->obj5, sizes[5]); |
2253 | | |
2254 | | /* Each image has 3 parts: variable preamble, the compressed |
2255 | | * data stream, and the fixed poststream. */ |
2256 | 0 | nimages = lpd->n; |
2257 | 0 | for (i = 0; i < nimages; i++) { |
2258 | 0 | if ((cid = pdfdataGetCid(lpd, i)) == NULL) { /* should not happen */ |
2259 | 0 | LEPT_FREE(sizes); |
2260 | 0 | LEPT_FREE(locs); |
2261 | 0 | return ERROR_INT("cid not found", __func__, 1); |
2262 | 0 | } |
2263 | 0 | str = sarrayGetString(lpd->saprex, i, L_NOCOPY); |
2264 | 0 | len = strlen(str); |
2265 | 0 | memcpy(data + locs[6 + i], str, len); |
2266 | 0 | memcpy(data + locs[6 + i] + len, |
2267 | 0 | cid->datacomp, cid->nbytescomp); |
2268 | 0 | memcpy(data + locs[6 + i] + len + cid->nbytescomp, |
2269 | 0 | lpd->poststream, strlen(lpd->poststream)); |
2270 | 0 | } |
2271 | | |
2272 | | /* Each colormap is simply a stored string */ |
2273 | 0 | for (i = 0; i < lpd->ncmap; i++) { |
2274 | 0 | str = sarrayGetString(lpd->sacmap, i, L_NOCOPY); |
2275 | 0 | memcpy(data + locs[6 + nimages + i], str, strlen(str)); |
2276 | 0 | } |
2277 | | |
2278 | | /* And finally the trailer */ |
2279 | 0 | memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer)); |
2280 | 0 | LEPT_FREE(sizes); |
2281 | 0 | LEPT_FREE(locs); |
2282 | 0 | return 0; |
2283 | 0 | } |
2284 | | |
2285 | | |
2286 | | /*---------------------------------------------------------------------* |
2287 | | * Helper functions for generating multipage pdf output * |
2288 | | *---------------------------------------------------------------------*/ |
2289 | | /*! |
2290 | | * \brief parseTrailerPdf() |
2291 | | * |
2292 | | * \param[in] bas lba of a pdf file |
2293 | | * \param[out] pda byte locations of the beginning of each object |
2294 | | * \return 0 if OK, 1 on error |
2295 | | */ |
2296 | | static l_int32 |
2297 | | parseTrailerPdf(L_BYTEA *bas, |
2298 | | L_DNA **pda) |
2299 | 0 | { |
2300 | 0 | char *str; |
2301 | 0 | l_uint8 nl = '\n'; |
2302 | 0 | l_uint8 *data; |
2303 | 0 | l_int32 i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok; |
2304 | 0 | size_t size; |
2305 | 0 | L_DNA *da, *daobj, *daxref; |
2306 | 0 | SARRAY *sa; |
2307 | |
|
2308 | 0 | if (!pda) |
2309 | 0 | return ERROR_INT("&da not defined", __func__, 1); |
2310 | 0 | *pda = NULL; |
2311 | 0 | if (!bas) |
2312 | 0 | return ERROR_INT("bas not defined", __func__, 1); |
2313 | 0 | data = l_byteaGetData(bas, &size); |
2314 | 0 | if (memcmp(data, "%PDF-1.", 7) != 0) |
2315 | 0 | return ERROR_INT("PDF header signature not found", __func__, 1); |
2316 | | |
2317 | | /* Search for "startxref" starting 50 bytes from the EOF */ |
2318 | 0 | start = 0; |
2319 | 0 | if (size > 50) |
2320 | 0 | start = size - 50; |
2321 | 0 | arrayFindSequence(data + start, size - start, |
2322 | 0 | (l_uint8 *)"startxref\n", 10, &loc, &found); |
2323 | 0 | if (!found) |
2324 | 0 | return ERROR_INT("startxref not found!", __func__, 1); |
2325 | 0 | if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1) |
2326 | 0 | return ERROR_INT("xrefloc not found!", __func__, 1); |
2327 | 0 | if (xrefloc < 0 || xrefloc >= size) |
2328 | 0 | return ERROR_INT("invalid xrefloc!", __func__, 1); |
2329 | 0 | sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0); |
2330 | 0 | str = sarrayGetString(sa, 1, L_NOCOPY); |
2331 | 0 | if ((sscanf(str, "0 %d", &nobj)) != 1) { |
2332 | 0 | sarrayDestroy(&sa); |
2333 | 0 | return ERROR_INT("nobj not found", __func__, 1); |
2334 | 0 | } |
2335 | | |
2336 | | /* Get starting locations. The numa index is the |
2337 | | * object number. loc[0] is the ID; loc[nobj + 1] is xrefloc. */ |
2338 | 0 | da = l_dnaCreate(nobj + 1); |
2339 | 0 | *pda = da; |
2340 | 0 | for (i = 0; i < nobj; i++) { |
2341 | 0 | str = sarrayGetString(sa, i + 2, L_NOCOPY); |
2342 | 0 | sscanf(str, "%d", &startloc); |
2343 | 0 | l_dnaAddNumber(da, startloc); |
2344 | 0 | } |
2345 | 0 | l_dnaAddNumber(da, xrefloc); |
2346 | |
|
2347 | | #if DEBUG_MULTIPAGE |
2348 | | lept_stderr("************** Trailer string ************\n"); |
2349 | | lept_stderr("xrefloc = %d", xrefloc); |
2350 | | sarrayWriteStderr(sa); |
2351 | | |
2352 | | lept_stderr("************** Object locations ************"); |
2353 | | l_dnaWriteStderr(da); |
2354 | | #endif /* DEBUG_MULTIPAGE */ |
2355 | 0 | sarrayDestroy(&sa); |
2356 | | |
2357 | | /* Verify correct parsing */ |
2358 | 0 | trailer_ok = TRUE; |
2359 | 0 | for (i = 1; i < nobj; i++) { |
2360 | 0 | l_dnaGetIValue(da, i, &startloc); |
2361 | 0 | if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) { |
2362 | 0 | L_ERROR("bad trailer for object %d\n", __func__, i); |
2363 | 0 | trailer_ok = FALSE; |
2364 | 0 | break; |
2365 | 0 | } |
2366 | 0 | } |
2367 | | |
2368 | | /* If the trailer is broken, reconstruct the correct obj locations */ |
2369 | 0 | if (!trailer_ok) { |
2370 | 0 | L_INFO("rebuilding pdf trailer\n", __func__); |
2371 | 0 | l_dnaEmpty(da); |
2372 | 0 | l_dnaAddNumber(da, 0); |
2373 | 0 | l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj); |
2374 | 0 | nobj = l_dnaGetCount(daobj); |
2375 | 0 | for (i = 0; i < nobj; i++) { |
2376 | 0 | l_dnaGetIValue(daobj, i, &loc); |
2377 | 0 | for (j = loc - 1; j > 0; j--) { |
2378 | 0 | if (data[j] == nl) |
2379 | 0 | break; |
2380 | 0 | } |
2381 | 0 | l_dnaAddNumber(da, j + 1); |
2382 | 0 | } |
2383 | 0 | l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref); |
2384 | 0 | l_dnaGetIValue(daxref, 0, &loc); |
2385 | 0 | l_dnaAddNumber(da, loc); |
2386 | 0 | l_dnaDestroy(&daobj); |
2387 | 0 | l_dnaDestroy(&daxref); |
2388 | 0 | } |
2389 | |
|
2390 | 0 | return 0; |
2391 | 0 | } |
2392 | | |
2393 | | |
2394 | | static char * |
2395 | | generatePagesObjStringPdf(NUMA *napage) |
2396 | 0 | { |
2397 | 0 | char *str; |
2398 | 0 | char *buf; |
2399 | 0 | l_int32 i, n, index, bufsize; |
2400 | 0 | SARRAY *sa; |
2401 | |
|
2402 | 0 | if (!napage) |
2403 | 0 | return (char *)ERROR_PTR("napage not defined", __func__, NULL); |
2404 | | |
2405 | 0 | n = numaGetCount(napage); |
2406 | 0 | bufsize = 100 + 16 * n; /* large enough to hold the output string */ |
2407 | 0 | buf = (char *)LEPT_CALLOC(bufsize, sizeof(char)); |
2408 | 0 | sa = sarrayCreate(n); |
2409 | 0 | for (i = 0; i < n; i++) { |
2410 | 0 | numaGetIValue(napage, i, &index); |
2411 | 0 | snprintf(buf, bufsize, " %d 0 R ", index); |
2412 | 0 | sarrayAddString(sa, buf, L_COPY); |
2413 | 0 | } |
2414 | |
|
2415 | 0 | str = sarrayToString(sa, 0); |
2416 | 0 | snprintf(buf, bufsize - 1, "3 0 obj\n" |
2417 | 0 | "<<\n" |
2418 | 0 | "/Type /Pages\n" |
2419 | 0 | "/Kids [%s]\n" |
2420 | 0 | "/Count %d\n" |
2421 | 0 | ">>\n" |
2422 | 0 | "endobj\n", |
2423 | 0 | str, n); |
2424 | 0 | sarrayDestroy(&sa); |
2425 | 0 | LEPT_FREE(str); |
2426 | 0 | return buf; |
2427 | 0 | } |
2428 | | |
2429 | | |
2430 | | /*! |
2431 | | * \brief substituteObjectNumbers() |
2432 | | * |
2433 | | * \param[in] bas lba of a pdf object |
2434 | | * \param[in] na_objs object number mapping array |
2435 | | * \return bad lba of rewritten pdf for the object |
2436 | | * |
2437 | | * <pre> |
2438 | | * Notes: |
2439 | | * (1) Interpret the first set of bytes as the object number, |
2440 | | * map to the new number, and write it out. |
2441 | | * (2) Find all occurrences of this 4-byte sequence: " 0 R" |
2442 | | * (3) Find the location and value of the integer preceding this, |
2443 | | * and map it to the new value. |
2444 | | * (4) Rewrite the object with new object numbers. |
2445 | | * </pre> |
2446 | | */ |
2447 | | static L_BYTEA * |
2448 | | substituteObjectNumbers(L_BYTEA *bas, |
2449 | | NUMA *na_objs) |
2450 | 0 | { |
2451 | 0 | l_uint8 space = ' '; |
2452 | 0 | l_uint8 *datas; |
2453 | 0 | l_uint8 buf[32]; /* only needs to hold one integer in ascii format */ |
2454 | 0 | l_int32 start, nrepl, i, j, nobjs, objin, objout, found; |
2455 | 0 | l_int32 *objs, *matches; |
2456 | 0 | size_t size; |
2457 | 0 | L_BYTEA *bad; |
2458 | 0 | L_DNA *da_match; |
2459 | |
|
2460 | 0 | if (!bas) |
2461 | 0 | return (L_BYTEA *)ERROR_PTR("bas not defined", __func__, NULL); |
2462 | 0 | if (!na_objs) |
2463 | 0 | return (L_BYTEA *)ERROR_PTR("na_objs not defined", __func__, NULL); |
2464 | | |
2465 | 0 | datas = l_byteaGetData(bas, &size); |
2466 | 0 | bad = l_byteaCreate(100); |
2467 | 0 | objs = numaGetIArray(na_objs); /* object number mapper */ |
2468 | 0 | nobjs = numaGetCount(na_objs); /* use for sanity checking */ |
2469 | | |
2470 | | /* Substitute the object number on the first line */ |
2471 | 0 | sscanf((char *)datas, "%d", &objin); |
2472 | 0 | if (objin < 0 || objin >= nobjs) { |
2473 | 0 | L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs); |
2474 | 0 | LEPT_FREE(objs); |
2475 | 0 | return bad; |
2476 | 0 | } |
2477 | 0 | objout = objs[objin]; |
2478 | 0 | snprintf((char *)buf, 32, "%d", objout); |
2479 | 0 | l_byteaAppendString(bad, (char *)buf); |
2480 | | |
2481 | | /* Find the set of matching locations for object references */ |
2482 | 0 | arrayFindSequence(datas, size, &space, 1, &start, &found); |
2483 | 0 | da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4); |
2484 | 0 | if (!da_match) { |
2485 | 0 | l_byteaAppendData(bad, datas + start, size - start); |
2486 | 0 | LEPT_FREE(objs); |
2487 | 0 | return bad; |
2488 | 0 | } |
2489 | | |
2490 | | /* Substitute all the object reference numbers */ |
2491 | 0 | nrepl = l_dnaGetCount(da_match); |
2492 | 0 | matches = l_dnaGetIArray(da_match); |
2493 | 0 | for (i = 0; i < nrepl; i++) { |
2494 | | /* Find the first space before the object number */ |
2495 | 0 | for (j = matches[i] - 1; j > 0; j--) { |
2496 | 0 | if (datas[j] == space) |
2497 | 0 | break; |
2498 | 0 | } |
2499 | | /* Copy bytes from 'start' up to the object number */ |
2500 | 0 | l_byteaAppendData(bad, datas + start, j - start + 1); |
2501 | 0 | sscanf((char *)(datas + j + 1), "%d", &objin); |
2502 | 0 | if (objin < 0 || objin >= nobjs) { |
2503 | 0 | L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs); |
2504 | 0 | LEPT_FREE(objs); |
2505 | 0 | LEPT_FREE(matches); |
2506 | 0 | l_dnaDestroy(&da_match); |
2507 | 0 | return bad; |
2508 | 0 | } |
2509 | 0 | objout = objs[objin]; |
2510 | 0 | snprintf((char *)buf, 32, "%d", objout); |
2511 | 0 | l_byteaAppendString(bad, (char *)buf); |
2512 | 0 | start = matches[i]; |
2513 | 0 | } |
2514 | 0 | l_byteaAppendData(bad, datas + start, size - start); |
2515 | |
|
2516 | 0 | LEPT_FREE(objs); |
2517 | 0 | LEPT_FREE(matches); |
2518 | 0 | l_dnaDestroy(&da_match); |
2519 | 0 | return bad; |
2520 | 0 | } |
2521 | | |
2522 | | |
2523 | | /*---------------------------------------------------------------------* |
2524 | | * Create/destroy/access pdf data * |
2525 | | *---------------------------------------------------------------------*/ |
2526 | | static L_PDF_DATA * |
2527 | | pdfdataCreate(const char *title) |
2528 | 0 | { |
2529 | 0 | L_PDF_DATA *lpd; |
2530 | |
|
2531 | 0 | lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA)); |
2532 | 0 | if (title) lpd->title = stringNew(title); |
2533 | 0 | lpd->cida = ptraCreate(10); |
2534 | 0 | lpd->xy = ptaCreate(10); |
2535 | 0 | lpd->wh = ptaCreate(10); |
2536 | 0 | lpd->saprex = sarrayCreate(10); |
2537 | 0 | lpd->sacmap = sarrayCreate(10); |
2538 | 0 | lpd->objsize = l_dnaCreate(20); |
2539 | 0 | lpd->objloc = l_dnaCreate(20); |
2540 | 0 | return lpd; |
2541 | 0 | } |
2542 | | |
2543 | | static void |
2544 | | pdfdataDestroy(L_PDF_DATA **plpd) |
2545 | 0 | { |
2546 | 0 | l_int32 i; |
2547 | 0 | L_COMP_DATA *cid; |
2548 | 0 | L_PDF_DATA *lpd; |
2549 | |
|
2550 | 0 | if (plpd== NULL) { |
2551 | 0 | L_WARNING("ptr address is null!\n", __func__); |
2552 | 0 | return; |
2553 | 0 | } |
2554 | 0 | if ((lpd = *plpd) == NULL) |
2555 | 0 | return; |
2556 | | |
2557 | 0 | if (lpd->title) LEPT_FREE(lpd->title); |
2558 | 0 | for (i = 0; i < lpd->n; i++) { |
2559 | 0 | cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION); |
2560 | 0 | l_CIDataDestroy(&cid); |
2561 | 0 | } |
2562 | |
|
2563 | 0 | ptraDestroy(&lpd->cida, 0, 0); |
2564 | 0 | if (lpd->id) LEPT_FREE(lpd->id); |
2565 | 0 | if (lpd->obj1) LEPT_FREE(lpd->obj1); |
2566 | 0 | if (lpd->obj2) LEPT_FREE(lpd->obj2); |
2567 | 0 | if (lpd->obj3) LEPT_FREE(lpd->obj3); |
2568 | 0 | if (lpd->obj4) LEPT_FREE(lpd->obj4); |
2569 | 0 | if (lpd->obj5) LEPT_FREE(lpd->obj5); |
2570 | 0 | if (lpd->poststream) LEPT_FREE(lpd->poststream); |
2571 | 0 | if (lpd->trailer) LEPT_FREE(lpd->trailer); |
2572 | 0 | if (lpd->xy) ptaDestroy(&lpd->xy); |
2573 | 0 | if (lpd->wh) ptaDestroy(&lpd->wh); |
2574 | 0 | if (lpd->mediabox) boxDestroy(&lpd->mediabox); |
2575 | 0 | if (lpd->saprex) sarrayDestroy(&lpd->saprex); |
2576 | 0 | if (lpd->sacmap) sarrayDestroy(&lpd->sacmap); |
2577 | 0 | if (lpd->objsize) l_dnaDestroy(&lpd->objsize); |
2578 | 0 | if (lpd->objloc) l_dnaDestroy(&lpd->objloc); |
2579 | 0 | LEPT_FREE(lpd); |
2580 | 0 | *plpd = NULL; |
2581 | 0 | } |
2582 | | |
2583 | | |
2584 | | static L_COMP_DATA * |
2585 | | pdfdataGetCid(L_PDF_DATA *lpd, |
2586 | | l_int32 index) |
2587 | 0 | { |
2588 | 0 | if (!lpd) |
2589 | 0 | return (L_COMP_DATA *)ERROR_PTR("lpd not defined", __func__, NULL); |
2590 | 0 | if (index < 0 || index >= lpd->n) |
2591 | 0 | return (L_COMP_DATA *)ERROR_PTR("invalid image index", __func__, NULL); |
2592 | | |
2593 | 0 | return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index); |
2594 | 0 | } |
2595 | | |
2596 | | |
2597 | | /*---------------------------------------------------------------------* |
2598 | | * Find number of pages in a pdf * |
2599 | | *---------------------------------------------------------------------*/ |
2600 | | /*! |
2601 | | * \brief getPdfPageCount() |
2602 | | * |
2603 | | * \param[in] fname filename |
2604 | | * \param[out] pnpages number of pages |
2605 | | * \return 0 if OK, 1 on error |
2606 | | * |
2607 | | * <pre> |
2608 | | * Notes: |
2609 | | * (1) Looks for the argument of the first instance of /Count in the file. |
2610 | | * (2) This first reads 10000 bytes from the beginning of the file. |
2611 | | * If "/Count" is not in that string, it reads the entire file |
2612 | | * and looks for "/Count". |
2613 | | * (3) This will not work on encrypted pdf files or on files where |
2614 | | * the "/Count" field is binary compressed. Not finding the |
2615 | | * "/Count" field is not an error, but a warning is given. |
2616 | | * </pre> |
2617 | | */ |
2618 | | l_ok |
2619 | | getPdfPageCount(const char *fname, |
2620 | | l_int32 *pnpages) |
2621 | 0 | { |
2622 | 0 | l_uint8 *data; |
2623 | 0 | l_int32 format, loc, ret, npages, found; |
2624 | 0 | size_t nread; |
2625 | |
|
2626 | 0 | if (!pnpages) |
2627 | 0 | return ERROR_INT("&npages not defined", __func__, 1); |
2628 | 0 | *pnpages = 0; |
2629 | 0 | if (!fname) |
2630 | 0 | return ERROR_INT("fname not defined", __func__, 1); |
2631 | | |
2632 | | /* Make sure this a pdf file */ |
2633 | 0 | findFileFormat(fname, &format); |
2634 | 0 | if (format != IFF_LPDF) |
2635 | 0 | return ERROR_INT("file is not pdf", __func__, 1); |
2636 | | |
2637 | | /* Read 10000 bytes from the beginning of the file */ |
2638 | 0 | if ((data = l_binaryReadSelect(fname, 0, 10000, &nread)) |
2639 | 0 | == NULL) |
2640 | 0 | return ERROR_INT("partial data not read", __func__, 1); |
2641 | | |
2642 | | /* Find the location of the first instance of "/Count". |
2643 | | * If it is not found, try reading the entire file and |
2644 | | * looking again. */ |
2645 | 0 | arrayFindSequence(data, nread, (const l_uint8 *)"/Count", |
2646 | 0 | strlen("/Count"), &loc, &found); |
2647 | 0 | if (!found) { |
2648 | 0 | lept_stderr("Reading entire file looking for '/Count'\n"); |
2649 | 0 | LEPT_FREE(data); |
2650 | 0 | if ((data = l_binaryRead(fname, &nread)) == NULL) |
2651 | 0 | return ERROR_INT("full data not read", __func__, 1); |
2652 | 0 | arrayFindSequence(data, nread, (const l_uint8 *)"/Count", |
2653 | 0 | strlen("/Count"), &loc, &found); |
2654 | 0 | if (!found) { |
2655 | 0 | LEPT_FREE(data); |
2656 | 0 | L_WARNING("/Count not found\n", __func__); |
2657 | 0 | return 0; |
2658 | 0 | } |
2659 | 0 | } |
2660 | | |
2661 | | /* Unlikely: make sure we can read the count field */ |
2662 | 0 | if (nread - loc < 12) { /* haven't read enough to capture page count */ |
2663 | 0 | LEPT_FREE(data); |
2664 | 0 | return ERROR_INT("data may not include page count field", __func__, 1); |
2665 | 0 | } |
2666 | | |
2667 | | /* Read the page count; if not found, puts garbage in npages */ |
2668 | 0 | ret = sscanf((char *)&data[loc], "/Count %d", &npages); |
2669 | 0 | LEPT_FREE(data); |
2670 | 0 | if (ret != 1) |
2671 | 0 | return ERROR_INT("npages not found", __func__, 1); |
2672 | 0 | *pnpages = npages; |
2673 | | /* lept_stderr("bytes read = %d, loc = %d, npages = %d\n", |
2674 | | nread, loc, *pnpages); */ |
2675 | 0 | return 0; |
2676 | 0 | } |
2677 | | |
2678 | | |
2679 | | /*---------------------------------------------------------------------* |
2680 | | * Find widths and heights of pages and media boxes in a pdf * |
2681 | | *---------------------------------------------------------------------*/ |
2682 | | /*! |
2683 | | * \brief getPdfPageSizes() |
2684 | | * |
2685 | | * \param[in] fname filename |
2686 | | * \param[out] pnaw [optional] array of page widths |
2687 | | * \param[out] pnah [optional] array of page heights |
2688 | | * \param[out] pmedw [optional] median page width |
2689 | | * \param[out] pmedh [optional] median page height |
2690 | | * \return 0 if OK, 1 on error |
2691 | | * |
2692 | | * <pre> |
2693 | | * Notes: |
2694 | | * (1) Finds the arguments of each instance of '/Width' and '/Height' |
2695 | | * in the file. |
2696 | | * (2) This will not work on encrypted pdf files or on files where |
2697 | | * the "/Width" and "/Height" fields are binary compressed. |
2698 | | * Not finding the "/Width" and /Height" fields is not an error, |
2699 | | * but a warning is given. |
2700 | | * </pre> |
2701 | | */ |
2702 | | l_ok |
2703 | | getPdfPageSizes(const char *fname, |
2704 | | NUMA **pnaw, |
2705 | | NUMA **pnah, |
2706 | | l_int32 *pmedw, |
2707 | | l_int32 *pmedh) |
2708 | 0 | { |
2709 | 0 | l_uint8 *data; |
2710 | 0 | l_int32 i, nw, nh, format, ret, loc, width, height; |
2711 | 0 | l_float32 fval; |
2712 | 0 | size_t nread; |
2713 | 0 | L_DNA *dnaw; /* width locations */ |
2714 | 0 | L_DNA *dnah; /* height locations */ |
2715 | 0 | NUMA *naw; /* widths */ |
2716 | 0 | NUMA *nah; /* heights */ |
2717 | |
|
2718 | 0 | if (pnaw) *pnaw = NULL; |
2719 | 0 | if (pnah) *pnah = NULL; |
2720 | 0 | if (pmedw) *pmedw = 0; |
2721 | 0 | if (pmedh) *pmedh = 0; |
2722 | 0 | if (!pnaw && !pnah && !pmedw && !pmedh) |
2723 | 0 | return ERROR_INT("no output requested", __func__, 1); |
2724 | 0 | if (!fname) |
2725 | 0 | return ERROR_INT("fname not defined", __func__, 1); |
2726 | | |
2727 | | /* Make sure this a pdf file */ |
2728 | 0 | findFileFormat(fname, &format); |
2729 | 0 | if (format != IFF_LPDF) |
2730 | 0 | return ERROR_INT("file is not pdf", __func__, 1); |
2731 | | |
2732 | | /* Read the file into memory and find all locations of |
2733 | | * '/Width' and '/Height' */ |
2734 | 0 | if ((data = l_binaryRead(fname, &nread)) == NULL) |
2735 | 0 | return ERROR_INT("full data not read", __func__, 1); |
2736 | 0 | dnaw = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Width", |
2737 | 0 | strlen("/Width")); |
2738 | 0 | dnah = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Height", |
2739 | 0 | strlen("/Height")); |
2740 | 0 | if (!dnaw) |
2741 | 0 | L_WARNING("unable to find widths\n", __func__); |
2742 | 0 | if (!dnah) |
2743 | 0 | L_WARNING("unable to find heights\n", __func__); |
2744 | 0 | if (!dnaw && !dnah) { |
2745 | 0 | LEPT_FREE(data); |
2746 | 0 | L_WARNING("no fields found\n", __func__); |
2747 | 0 | return 0; |
2748 | 0 | } |
2749 | | |
2750 | | /* Find the page widths and heights */ |
2751 | 0 | nw = l_dnaGetCount(dnaw); |
2752 | 0 | naw = numaCreate(nw); |
2753 | 0 | for (i = 0; i < nw; i++) { |
2754 | 0 | l_dnaGetIValue(dnaw, i, &loc); |
2755 | 0 | ret = sscanf((char *)&data[loc], "/Width %d", &width); |
2756 | 0 | if (ret != 1) { |
2757 | 0 | L_ERROR("width not found for item %d at loc %d\n", |
2758 | 0 | __func__, i, loc); |
2759 | 0 | continue; |
2760 | 0 | } |
2761 | 0 | numaAddNumber(naw, width); |
2762 | 0 | } |
2763 | 0 | nh = l_dnaGetCount(dnah); |
2764 | 0 | nah = numaCreate(nh); |
2765 | 0 | for (i = 0; i < nh; i++) { |
2766 | 0 | l_dnaGetIValue(dnah, i, &loc); |
2767 | 0 | ret = sscanf((char *)&data[loc], "/Height %d", &height); |
2768 | 0 | if (ret != 1) { |
2769 | 0 | L_ERROR("height not found for item %d at loc %d\n", |
2770 | 0 | __func__, i, loc); |
2771 | 0 | continue; |
2772 | 0 | } |
2773 | 0 | numaAddNumber(nah, height); |
2774 | 0 | } |
2775 | |
|
2776 | 0 | LEPT_FREE(data); |
2777 | 0 | l_dnaDestroy(&dnaw); |
2778 | 0 | l_dnaDestroy(&dnah); |
2779 | 0 | if (pmedw) { |
2780 | 0 | numaGetMedian(naw, &fval); |
2781 | 0 | *pmedw = lept_roundftoi(fval); |
2782 | 0 | } |
2783 | 0 | if (pnaw) |
2784 | 0 | *pnaw = naw; |
2785 | 0 | else |
2786 | 0 | numaDestroy(&naw); |
2787 | 0 | if (pmedh) { |
2788 | 0 | numaGetMedian(nah, &fval); |
2789 | 0 | *pmedh = lept_roundftoi(fval); |
2790 | 0 | } |
2791 | 0 | if (pnah) |
2792 | 0 | *pnah = nah; |
2793 | 0 | else |
2794 | 0 | numaDestroy(&nah); |
2795 | 0 | return 0; |
2796 | 0 | } |
2797 | | |
2798 | | |
2799 | | /*! |
2800 | | * \brief getPdfMediaBoxSizes() |
2801 | | * |
2802 | | * \param[in] fname filename |
2803 | | * \param[out] pnaw [optional] array of mediabox widths |
2804 | | * \param[out] pnah [optional] array of mediabox heights |
2805 | | * \param[out] pmedw [optional] median mediabox width |
2806 | | * \param[out] pmedh [optional] median mediabox height |
2807 | | * \return 0 if OK, 1 on error |
2808 | | * |
2809 | | * <pre> |
2810 | | * Notes: |
2811 | | * (1) Finds the arguments of each instance of '/MediaBox' in the file. |
2812 | | * (2) This will not work on encrypted pdf files or on files where |
2813 | | * the "/MediaBoxes" field is binary compressed. Not finding |
2814 | | * the "/MediaBoxes" field is not an error, but a warning is given. |
2815 | | * (3) This is useful for determining if the media boxes are |
2816 | | * incorrectly assigned, such as assuming the resolution is 72 ppi. |
2817 | | * If that happens and the input the the renderer assumes the |
2818 | | * resolution is 300 ppi, the rendered images will be over 4x too |
2819 | | * large in each dimension. |
2820 | | * (4) An image dimension of 11 inches corresponds to a MediaBox |
2821 | | * parameter of 792. We consider a value > 850 to be oversized |
2822 | | * and not to be taken literally. |
2823 | | * </pre> |
2824 | | */ |
2825 | | l_ok |
2826 | | getPdfMediaBoxSizes(const char *fname, |
2827 | | NUMA **pnaw, |
2828 | | NUMA **pnah, |
2829 | | l_int32 *pmedw, |
2830 | | l_int32 *pmedh) |
2831 | 0 | { |
2832 | 0 | l_uint8 *data; |
2833 | 0 | l_int32 i, n, format, ret, loc; |
2834 | 0 | l_float32 fval, ignore1, ignore2, w, h; |
2835 | 0 | size_t nread; |
2836 | 0 | L_DNA *dna; /* mediabox locations */ |
2837 | 0 | NUMA *naw; /* mediabox widths */ |
2838 | 0 | NUMA *nah; /* mediabox heights */ |
2839 | |
|
2840 | 0 | if (pnaw) *pnaw = NULL; |
2841 | 0 | if (pnah) *pnah = NULL; |
2842 | 0 | if (pmedw) *pmedw = 0; |
2843 | 0 | if (pmedh) *pmedh = 0; |
2844 | 0 | if (!pnaw && !pnah && !pmedw && !pmedh) |
2845 | 0 | return ERROR_INT("no output requested", __func__, 1); |
2846 | 0 | if (!fname) |
2847 | 0 | return ERROR_INT("fname not defined", __func__, 1); |
2848 | | |
2849 | | /* Make sure this a pdf file */ |
2850 | 0 | findFileFormat(fname, &format); |
2851 | 0 | if (format != IFF_LPDF) |
2852 | 0 | return ERROR_INT("file is not pdf", __func__, 1); |
2853 | | |
2854 | | /* Read the file into memory and find all locations of '/MediaBox' */ |
2855 | 0 | if ((data = l_binaryRead(fname, &nread)) == NULL) |
2856 | 0 | return ERROR_INT("full data not read", __func__, 1); |
2857 | 0 | dna = arrayFindEachSequence(data, nread, (const l_uint8 *)"/MediaBox", |
2858 | 0 | strlen("/MediaBox")); |
2859 | 0 | if (!dna) { |
2860 | 0 | LEPT_FREE(data); |
2861 | 0 | L_WARNING("no mediaboxes found\n", __func__); |
2862 | 0 | return 1; |
2863 | 0 | } |
2864 | | |
2865 | | /* Find the mediabox widths and heights */ |
2866 | 0 | n = l_dnaGetCount(dna); |
2867 | 0 | naw = numaCreate(n); |
2868 | 0 | nah = numaCreate(n); |
2869 | 0 | for (i = 0; i < n; i++) { |
2870 | 0 | l_dnaGetIValue(dna, i, &loc); |
2871 | 0 | ret = sscanf((char *)&data[loc], "/MediaBox [ %f %f %f %f", |
2872 | 0 | &ignore1, &ignore2, &w, &h); |
2873 | 0 | if (ret != 4) { |
2874 | 0 | L_ERROR("mediabox sizes not found for item %d at loc %d\n", |
2875 | 0 | __func__, i, loc); |
2876 | 0 | continue; |
2877 | 0 | } |
2878 | 0 | numaAddNumber(naw, w); |
2879 | 0 | numaAddNumber(nah, h); |
2880 | 0 | } |
2881 | 0 | LEPT_FREE(data); |
2882 | 0 | l_dnaDestroy(&dna); |
2883 | |
|
2884 | 0 | if (pmedw) { |
2885 | 0 | numaGetMedian(naw, &fval); |
2886 | 0 | *pmedw = lept_roundftoi(fval); |
2887 | 0 | if (*pmedw > 850) lept_stderr("oversize width: %d\n", *pmedw); |
2888 | 0 | } |
2889 | 0 | if (pnaw) |
2890 | 0 | *pnaw = naw; |
2891 | 0 | else |
2892 | 0 | numaDestroy(&naw); |
2893 | 0 | if (pmedh) { |
2894 | 0 | numaGetMedian(nah, &fval); |
2895 | 0 | *pmedh = lept_roundftoi(fval); |
2896 | 0 | if (*pmedh > 850) lept_stderr("oversize height: %d\n", *pmedh); |
2897 | 0 | } |
2898 | 0 | if (pnah) |
2899 | 0 | *pnah = nah; |
2900 | 0 | else |
2901 | 0 | numaDestroy(&nah); |
2902 | 0 | return 0; |
2903 | 0 | } |
2904 | | |
2905 | | |
2906 | | /*---------------------------------------------------------------------* |
2907 | | * Find effective resolution of images rendered from a pdf * |
2908 | | *---------------------------------------------------------------------*/ |
2909 | | /*! |
2910 | | * \brief getPdfRendererResolution() |
2911 | | * |
2912 | | * \param[in] infile filename of input pdf file |
2913 | | * \param[in] outdir directory of rendered output images |
2914 | | * \param[out] pres desired resolution to use with renderer |
2915 | | * \return 0 if OK, 1 on error |
2916 | | * |
2917 | | * <pre> |
2918 | | * Notes: |
2919 | | * (1) Finds the input resolution to pdftoppm that will generate |
2920 | | * images with a maximum dimension of about 3300 pixels, |
2921 | | * representing a full page at 300 ppi. |
2922 | | * (2) It is most important is to make sure the renderer does |
2923 | | * not make huge images because of an error in /MediaBox. |
2924 | | * An image dimension of 11 inches corresponds to a MediaBox |
2925 | | * parameter of 792. We consider a value > 850 to be oversized |
2926 | | * and not to be taken literally. If the mediaboxes are |
2927 | | * oversized, choose an appropriate lower resolution. |
2928 | | * (3) If the mediaboxes are not accessible, render an image at |
2929 | | * a low known resolution (say, 72 ppi) and based on the image |
2930 | | * size, determine the resolution necessary to make an image |
2931 | | * with 3300 pixels in the largest dimension. |
2932 | | * (4) Requires pdftoppm, so this is disabled on windows for now. |
2933 | | * (5) Requires the ability to call an external program, so it is |
2934 | | * necessary to call setLeptDebugOK(1) before this function. |
2935 | | * </pre> |
2936 | | */ |
2937 | | l_ok |
2938 | | getPdfRendererResolution(const char *infile, |
2939 | | const char *outdir, |
2940 | | l_int32 *pres) |
2941 | 0 | { |
2942 | 0 | char buf[256]; |
2943 | 0 | char *tail, *basename, *fname; |
2944 | 0 | l_int32 ret, res, medw, medh, medmax, npages, pageno, w, h; |
2945 | 0 | SARRAY *sa; |
2946 | |
|
2947 | 0 | if (!pres) |
2948 | 0 | return ERROR_INT("&res not defined", __func__, 1); |
2949 | 0 | *pres = 300; /* default */ |
2950 | |
|
2951 | | #ifdef _WIN32 |
2952 | | L_INFO("Requires pdftoppm, so this is disabled on windows.\n" |
2953 | | "Returns default resolution 300 ppi", __func__); |
2954 | | return 0; |
2955 | | #endif /* _WIN32 */ |
2956 | |
|
2957 | 0 | if (!LeptDebugOK) { |
2958 | 0 | L_INFO("Running pdftoppm is disabled; " |
2959 | 0 | "use setLeptDebugOK(1) to enable\n", |
2960 | 0 | "returns default resolution 300 ppi\n", __func__); |
2961 | 0 | return 1; |
2962 | 0 | } |
2963 | | |
2964 | 0 | if (!infile) |
2965 | 0 | return ERROR_INT("infile not defined", __func__, 1); |
2966 | 0 | if (!outdir) |
2967 | 0 | return ERROR_INT("outdir not defined", __func__, 1); |
2968 | | |
2969 | 0 | res = 300; /* default value */ |
2970 | 0 | ret = getPdfMediaBoxSizes(infile, NULL, NULL, &medw, &medh); |
2971 | 0 | if (ret == 0) { /* Check for oversize mediaboxes */ |
2972 | 0 | lept_stderr("Media Box medians: medw = %d, medh = %d\n", medw, medh); |
2973 | 0 | medmax = L_MAX(medw, medh); |
2974 | 0 | if (medmax > 850) { |
2975 | 0 | res = 300 * ((l_float32)792 / (l_float32)medmax); |
2976 | 0 | lept_stderr(" Oversize media box; use resolution = %d\n", res); |
2977 | 0 | *pres = res; |
2978 | 0 | } |
2979 | 0 | return 0; |
2980 | 0 | } |
2981 | | |
2982 | | /* No mediaboxes; render one page and measure the max dimension */ |
2983 | 0 | lept_stderr("Media Box dimensions not found\n"); |
2984 | 0 | getPdfPageCount(infile, &npages); |
2985 | 0 | pageno = (npages > 0) ? (npages + 1) / 2 : 1; |
2986 | 0 | splitPathAtDirectory(infile, NULL, &tail); |
2987 | 0 | splitPathAtExtension(tail, &basename, NULL); |
2988 | 0 | snprintf(buf, sizeof(buf), "pdftoppm -f %d -l %d -r 72 %s %s/%s", |
2989 | 0 | pageno, pageno, infile, outdir, basename); |
2990 | 0 | LEPT_FREE(tail); |
2991 | 0 | LEPT_FREE(basename); |
2992 | 0 | callSystemDebug(buf); /* pdftoppm */ |
2993 | | |
2994 | | /* Get the page size */ |
2995 | 0 | sa = getSortedPathnamesInDirectory(outdir, NULL, 0, 0); |
2996 | 0 | fname = sarrayGetString(sa, 0, L_NOCOPY); |
2997 | 0 | pixReadHeader(fname, NULL, &w, &h, NULL, NULL, NULL); |
2998 | 0 | sarrayDestroy(&sa); |
2999 | 0 | if (w > 0 && h > 0) { |
3000 | 0 | res = L_MIN((72 * 3300 / L_MAX(w, h)), 600); |
3001 | 0 | *pres = res; |
3002 | 0 | lept_stderr("Use resolution = %d\n", res); |
3003 | 0 | } else { |
3004 | 0 | L_ERROR("page size not found; assuming res = 300\n", __func__); |
3005 | 0 | } |
3006 | |
|
3007 | 0 | return 0; |
3008 | 0 | } |
3009 | | |
3010 | | |
3011 | | /*---------------------------------------------------------------------* |
3012 | | * Set flags for special modes * |
3013 | | *---------------------------------------------------------------------*/ |
3014 | | /*! |
3015 | | * \brief l_pdfSetG4ImageMask() |
3016 | | * |
3017 | | * \param[in] flag 1 for writing g4 data as fg only through a mask; |
3018 | | * 0 for writing fg and bg |
3019 | | * \return void |
3020 | | * |
3021 | | * <pre> |
3022 | | * Notes: |
3023 | | * (1) The default is for writing only the fg (through the mask). |
3024 | | * That way when you write a 1 bpp image, the bg is transparent, |
3025 | | * so any previously written image remains visible behind it. |
3026 | | * </pre> |
3027 | | */ |
3028 | | void |
3029 | | l_pdfSetG4ImageMask(l_int32 flag) |
3030 | 0 | { |
3031 | 0 | var_WRITE_G4_IMAGE_MASK = flag; |
3032 | 0 | } |
3033 | | |
3034 | | |
3035 | | /*! |
3036 | | * \brief l_pdfSetDateAndVersion() |
3037 | | * |
3038 | | * \param[in] flag 1 for writing date/time and leptonica version; |
3039 | | * 0 for omitting this from the metadata |
3040 | | * \return void |
3041 | | * |
3042 | | * <pre> |
3043 | | * Notes: |
3044 | | * (1) The default is for writing this data. For regression tests |
3045 | | * that compare output against golden files, it is useful to omit. |
3046 | | * </pre> |
3047 | | */ |
3048 | | void |
3049 | | l_pdfSetDateAndVersion(l_int32 flag) |
3050 | 0 | { |
3051 | 0 | var_WRITE_DATE_AND_VERSION = flag; |
3052 | 0 | } |
3053 | | |
3054 | | /* --------------------------------------------*/ |
3055 | | #endif /* USE_PDFIO */ |
3056 | | /* --------------------------------------------*/ |