/src/mupdf/source/fitz/output-pdfocr.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2021 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #include <assert.h> |
26 | | #include <string.h> |
27 | | #include <limits.h> |
28 | | |
29 | | #ifdef OCR_DISABLED |
30 | | |
/* Usage text for the PDFOCR writer options; empty when OCR support is
   compiled out.
   In non-OCR builds, we need to define this otherwise SWIG Python gets SEGV
   when it attempts to import mupdf.py and _mupdf.py. */
const char *fz_pdfocr_write_options_usage = "";
34 | | |
35 | | #else |
36 | | |
37 | | #include "tessocr.h" |
38 | | |
/* Human readable summary of the option keys understood by
   fz_parse_pdfocr_options() (compression, strip-height, ocr-language and
   ocr-datadir). */
const char *fz_pdfocr_write_options_usage =
	"PDFOCR output options:\n"
	"\tcompression=none: No compression (default)\n"
	"\tcompression=flate: Flate compression\n"
	"\tstrip-height=N: Strip height (default 0=fullpage)\n"
	"\tocr-language=<lang>: OCR language (default=eng)\n"
	"\tocr-datadir=<datadir>: OCR data path (default=rely on TESSDATA_PREFIX)\n"
	"\n";
47 | | |
/* PDF object 3: the Type0 font dictionary for "GlyphLessFont". It points at
   the descendant font (4 0 R) and the ToUnicode CMap (6 0 R).
   pdfocr_write_header() writes it verbatim on the first page and every page
   object references it as /F0. */
static const char funky_font[] =
	"3 0 obj\n<</BaseFont/GlyphLessFont/DescendantFonts[4 0 R]"
	"/Encoding/Identity-H/Subtype/Type0/ToUnicode 6 0 R/Type/Font"
	">>\nendobj\n";
52 | | |
/* PDF object 4: the CIDFontType2 descendant font. It references the
   CIDToGIDMap stream (5 0 R) and the font descriptor (7 0 R), and declares a
   default glyph width (/DW) of 500. */
static const char funky_font2[] =
	"4 0 obj\n"
	"<</BaseFont/GlyphLessFont/CIDToGIDMap 5 0 R"
	"/CIDSystemInfo<</Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
	"/FontDescriptor 7 0 R/Subtype/CIDFontType2/Type/Font/DW 500>>"
	"\nendobj\n";
59 | | |
/* PDF object 5: the FlateDecode stream used as the CIDToGIDMap of the
   descendant font. The payload is binary and contains NUL bytes, which is why
   the writer emits sizeof(funky_font3)-1 bytes rather than treating it as a
   C string. Do not reformat the data: the declared /Length must keep matching
   the bytes. */
static const char funky_font3[] =
	"5 0 obj\n<</Length 210/Filter/FlateDecode>>\nstream\n"
	"\x78\x9c\xec\xc2\x01\x09\x00\x00\x00\x02\xa0\xfa\x7f\xba\x21\x89"
	"\xa6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x80\x7b\x03\x00\x00\xff\xff\xec\xc2\x01\x0d\x00\x00\x00\xc2\x20"
	"\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xec\xc2\x01\x0d\x00"
	"\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xed"
	"\xc2\x01\x0d\x00\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\xff"
	"\x00\x10"
	"\nendstream\nendobj\n";
77 | | |
/* PDF object 6: the ToUnicode CMap referenced by the Type0 font. It declares
   a single codespace range <0000>..<FFFF> and one bfrange that maps that
   whole range starting at <0000>. The text is emitted verbatim, so the
   declared /Length has to keep matching the stream contents. */
static const char funky_font4[] =
	"6 0 obj\n<</Length 353>>\nstream\n"
	"/CIDInit /ProcSet findresource begin\n"
	"12 dict begin\n"
	"begincmap\n"
	"/CIDSystemInfo\n"
	"<<\n"
	" /Registry (Adobe)\n"
	" /Ordering (UCS)\n"
	" /Supplement 0\n"
	">> def\n"
	"/CMapName /Adobe-Identity-UCS def\n"
	"/CMapType 2 def\n"
	"1 begincodespacerange\n"
	"<0000> <FFFF>\n"
	"endcodespacerange\n"
	"1 beginbfrange\n"
	"<0000> <FFFF> <0000>\n"
	"endbfrange\n"
	"endcmap\n"
	"CMapName currentdict /CMap defineresource pop\n"
	"end\n"
	"end\n"
	"endstream\n"
	"endobj\n";
103 | | |
/* PDF object 7: the FontDescriptor for GlyphLessFont. It carries the font
   metrics and points at the embedded font program (/FontFile2 8 0 R). */
static const char funky_font5[] =
	"7 0 obj\n"
	"<</Ascent 1000/CapHeight 1000/Descent -1/Flags 5"
	"/FontBBox[0 0 500 1000]/FontFile2 8 0 R/FontName/GlyphLessFont"
	"/ItalicAngle 0/StemV 80/Type/FontDescriptor>>\nendobj\n";
109 | | |
/* PDF object 8: the uncompressed stream referenced as /FontFile2 by the font
   descriptor. The payload is binary and contains NUL bytes, so it is written
   with an explicit length of sizeof(funky_font6)-1. Keep the bytes exactly as
   they are: /Length and /Length1 are hard coded in the dictionary. */
static const char funky_font6[] =
	"8 0 obj\n<</Length 572/Length1 572>>\nstream\n"
	"\x00\x01\x00\x00\x00\x0a\x00\x80\x00\x03\x00\x20\x4f\x53\x2f\x32"
	"\x56\xde\xc8\x94\x00\x00\x01\x28\x00\x00\x00\x60\x63\x6d\x61\x70"
	"\x00\x0a\x00\x34\x00\x00\x01\x90\x00\x00\x00\x1e\x67\x6c\x79\x66"
	"\x15\x22\x41\x24\x00\x00\x01\xb8\x00\x00\x00\x18\x68\x65\x61\x64"
	"\x0b\x78\xf1\x65\x00\x00\x00\xac\x00\x00\x00\x36\x68\x68\x65\x61"
	"\x0c\x02\x04\x02\x00\x00\x00\xe4\x00\x00\x00\x24\x68\x6d\x74\x78"
	"\x04\x00\x00\x00\x00\x00\x01\x88\x00\x00\x00\x08\x6c\x6f\x63\x61"
	"\x00\x0c\x00\x00\x00\x00\x01\xb0\x00\x00\x00\x06\x6d\x61\x78\x70"
	"\x00\x04\x00\x05\x00\x00\x01\x08\x00\x00\x00\x20\x6e\x61\x6d\x65"
	"\xf2\xeb\x16\xda\x00\x00\x01\xd0\x00\x00\x00\x4b\x70\x6f\x73\x74"
	"\x00\x01\x00\x01\x00\x00\x02\x1c\x00\x00\x00\x20\x00\x01\x00\x00"
	"\x00\x01\x00\x00\xb0\x94\x71\x10\x5f\x0f\x3c\xf5\x04\x07\x08\x00"
	"\x00\x00\x00\x00\xcf\x9a\xfc\x6e\x00\x00\x00\x00\xd4\xc3\xa7\xf2"
	"\x00\x00\x00\x00\x04\x00\x08\x00\x00\x00\x00\x10\x00\x02\x00\x00"
	"\x00\x00\x00\x00\x00\x01\x00\x00\x08\x00\xff\xff\x00\x00\x04\x00"
	"\x00\x00\x00\x00\x04\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x02\x00\x01\x00\x00\x00\x02\x00\x04"
	"\x00\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x01\x90\x00\x05"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x01\x00\x01\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x47\x4f\x4f\x47\x00\x40\x00\x00\x00\x00\x00\x01\xff\xff"
	"\x00\x00\x00\x01\x00\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00"
	"\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00\x14\x00\x03\x00\x00"
	"\x00\x00\x00\x14\x00\x06\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x00\x04\x00"
	"\x08\x00\x00\x03\x00\x00\x31\x21\x11\x21\x04\x00\xfc\x00\x08\x00"
	"\x00\x00\x00\x03\x00\x2a\x00\x00\x00\x03\x00\x00\x00\x05\x00\x16"
	"\x00\x00\x00\x01\x00\x00\x00\x00\x00\x05\x00\x0b\x00\x16\x00\x03"
	"\x00\x01\x04\x09\x00\x05\x00\x16\x00\x00\x00\x56\x00\x65\x00\x72"
	"\x00\x73\x00\x69\x00\x6f\x00\x6e\x00\x20\x00\x31\x00\x2e\x00\x30"
	"\x56\x65\x72\x73\x69\x6f\x6e\x20\x31\x2e\x30\x00\x00\x01\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\nendstream\nendobj\n";
149 | | |
150 | | #endif |
151 | | |
152 | | fz_pdfocr_options * |
153 | | fz_parse_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, const char *args) |
154 | 0 | { |
155 | 0 | #ifdef OCR_DISABLED |
156 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
157 | | #else |
158 | | const char *val; |
159 | | |
160 | | memset(opts, 0, sizeof *opts); |
161 | | |
162 | | if (fz_has_option(ctx, args, "compression", &val)) |
163 | | { |
164 | | if (fz_option_eq(val, "none")) |
165 | | opts->compress = 0; |
166 | | else if (fz_option_eq(val, "flate")) |
167 | | opts->compress = 1; |
168 | | else |
169 | | fz_throw(ctx, FZ_ERROR_GENERIC, "Unsupported PDFOCR compression %s (none, or flate only)", val); |
170 | | } |
171 | | if (fz_has_option(ctx, args, "strip-height", &val)) |
172 | | { |
173 | | int i = fz_atoi(val); |
174 | | if (i <= 0) |
175 | | fz_throw(ctx, FZ_ERROR_GENERIC, "Unsupported PDFOCR strip height %d (suggest 0)", i); |
176 | | opts->strip_height = i; |
177 | | } |
178 | | if (fz_has_option(ctx, args, "ocr-language", &val)) |
179 | | { |
180 | | fz_copy_option(ctx, val, opts->language, nelem(opts->language)); |
181 | | } |
182 | | if (fz_has_option(ctx, args, "ocr-datadir", &val)) |
183 | | { |
184 | | fz_copy_option(ctx, val, opts->datadir, nelem(opts->datadir)); |
185 | | } |
186 | | |
187 | | return opts; |
188 | | #endif |
189 | 0 | } |
190 | | |
191 | | void |
192 | | fz_write_pixmap_as_pdfocr(fz_context *ctx, fz_output *out, const fz_pixmap *pixmap, const fz_pdfocr_options *pdfocr) |
193 | 0 | { |
194 | 0 | #ifdef OCR_DISABLED |
195 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
196 | | #else |
197 | | fz_band_writer *writer; |
198 | | |
199 | | if (!pixmap || !out) |
200 | | return; |
201 | | |
202 | | writer = fz_new_pdfocr_band_writer(ctx, out, pdfocr); |
203 | | fz_try(ctx) |
204 | | { |
205 | | fz_write_header(ctx, writer, pixmap->w, pixmap->h, pixmap->n, pixmap->alpha, pixmap->xres, pixmap->yres, 0, pixmap->colorspace, pixmap->seps); |
206 | | fz_write_band(ctx, writer, pixmap->stride, pixmap->h, pixmap->samples); |
207 | | fz_close_band_writer(ctx, writer); |
208 | | } |
209 | | fz_always(ctx) |
210 | | fz_drop_band_writer(ctx, writer); |
211 | | fz_catch(ctx) |
212 | | fz_rethrow(ctx); |
213 | | #endif |
214 | 0 | } |
215 | | |
216 | | #ifndef OCR_DISABLED |
/* State of one PDFOCR band writer. It must start with the generic
   fz_band_writer so that the callbacks can cast between the two. */
typedef struct pdfocr_band_writer_s
{
	fz_band_writer super;
	fz_pdfocr_options options;

	/* Number that new_obj() will hand out next. */
	int obj_num;
	/* Allocated length of xref, and the file offset of each object
	 * indexed by object number. */
	int xref_max;
	int64_t *xref;
	/* Pages started so far, allocated length of page_obj, and the
	 * object number of each page object. */
	int pages;
	int page_max;
	int *page_obj;
	/* Rows of the strip currently being collected (w * n bytes each). */
	unsigned char *stripbuf;
	/* Scratch buffer for deflated strip data, and its size. */
	unsigned char *compbuf;
	size_t complen;

	/* Opaque handle handed to ocr_recognise() and ocr_fin(). */
	void *tessapi;
	/* One byte per pixel copy of the page handed to the OCR engine; its
	 * width is the page width rounded up to a multiple of 4. */
	fz_pixmap *ocrbitmap;

	/* Optional progress callback and its argument (see pdfocr_progress). */
	fz_pdfocr_progress_fn *progress;
	void *progress_arg;
} pdfocr_band_writer;
238 | | |
239 | | static int |
240 | | new_obj(fz_context *ctx, pdfocr_band_writer *writer) |
241 | | { |
242 | | int64_t pos = fz_tell_output(ctx, writer->super.out); |
243 | | |
244 | | if (writer->obj_num >= writer->xref_max) |
245 | | { |
246 | | int new_max = writer->xref_max * 2; |
247 | | if (new_max < writer->obj_num + 8) |
248 | | new_max = writer->obj_num + 8; |
249 | | writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t); |
250 | | writer->xref_max = new_max; |
251 | | } |
252 | | |
253 | | writer->xref[writer->obj_num] = pos; |
254 | | |
255 | | return writer->obj_num++; |
256 | | } |
257 | | |
258 | | static void |
259 | | pdfocr_write_header(fz_context *ctx, fz_band_writer *writer_, fz_colorspace *cs) |
260 | | { |
261 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
262 | | fz_output *out = writer->super.out; |
263 | | int w = writer->super.w; |
264 | | int h = writer->super.h; |
265 | | int n = writer->super.n; |
266 | | int s = writer->super.s; |
267 | | int a = writer->super.alpha; |
268 | | int xres = writer->super.xres; |
269 | | int yres = writer->super.yres; |
270 | | int sh = writer->options.strip_height; |
271 | | int strips; |
272 | | int i; |
273 | | |
274 | | if (sh == 0) |
275 | | sh = h; |
276 | | assert(sh != 0 && "pdfocr_write_header() should not be given zero height input."); |
277 | | strips = (h + sh-1)/sh; |
278 | | |
279 | | if (a != 0) |
280 | | fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR cannot write alpha channel"); |
281 | | if (s != 0) |
282 | | fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR cannot write spot colors"); |
283 | | if (n != 3 && n != 1) |
284 | | fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR expected to be Grayscale or RGB"); |
285 | | |
286 | | fz_free(ctx, writer->stripbuf); |
287 | | writer->stripbuf = NULL; |
288 | | fz_free(ctx, writer->compbuf); |
289 | | writer->compbuf = NULL; |
290 | | fz_drop_pixmap(ctx, writer->ocrbitmap); |
291 | | writer->ocrbitmap = NULL; |
292 | | writer->stripbuf = Memento_label(fz_malloc(ctx, (size_t)w * sh * n), "pdfocr_stripbuf"); |
293 | | writer->complen = fz_deflate_bound(ctx, (size_t)w * sh * n); |
294 | | writer->compbuf = Memento_label(fz_malloc(ctx, writer->complen), "pdfocr_compbuf"); |
295 | | /* Always round the width of ocrbitmap up to a multiple of 4. */ |
296 | | writer->ocrbitmap = fz_new_pixmap(ctx, NULL, (w+3)&~3, h, NULL, 0); |
297 | | fz_set_pixmap_resolution(ctx, writer->ocrbitmap, xres, yres); |
298 | | |
299 | | /* Send the file header on the first page */ |
300 | | if (writer->pages == 0) |
301 | | { |
302 | | fz_write_string(ctx, out, "%PDF-1.4\n%PDFOCR-1.0\n"); |
303 | | |
304 | | if (writer->xref_max < 9) |
305 | | { |
306 | | int new_max = 9; |
307 | | writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t); |
308 | | writer->xref_max = new_max; |
309 | | } |
310 | | writer->xref[3] = fz_tell_output(ctx, out); |
311 | | fz_write_data(ctx, out, funky_font, sizeof(funky_font)-1); |
312 | | writer->xref[4] = fz_tell_output(ctx, out); |
313 | | fz_write_data(ctx, out, funky_font2, sizeof(funky_font2)-1); |
314 | | writer->xref[5] = fz_tell_output(ctx, out); |
315 | | fz_write_data(ctx, out, funky_font3, sizeof(funky_font3)-1); |
316 | | writer->xref[6] = fz_tell_output(ctx, out); |
317 | | fz_write_data(ctx, out, funky_font4, sizeof(funky_font4)-1); |
318 | | writer->xref[7] = fz_tell_output(ctx, out); |
319 | | fz_write_data(ctx, out, funky_font5, sizeof(funky_font5)-1); |
320 | | writer->xref[8] = fz_tell_output(ctx, out); |
321 | | fz_write_data(ctx, out, funky_font6, sizeof(funky_font6)-1); |
322 | | } |
323 | | |
324 | | if (writer->page_max <= writer->pages) |
325 | | { |
326 | | int new_max = writer->page_max * 2; |
327 | | if (new_max == 0) |
328 | | new_max = writer->pages + 8; |
329 | | writer->page_obj = fz_realloc_array(ctx, writer->page_obj, new_max, int); |
330 | | writer->page_max = new_max; |
331 | | } |
332 | | writer->page_obj[writer->pages] = writer->obj_num; |
333 | | writer->pages++; |
334 | | |
335 | | /* Send the Page Object */ |
336 | | fz_write_printf(ctx, out, "%d 0 obj\n<</Type/Page/Parent 2 0 R/Resources<</XObject<<", new_obj(ctx, writer)); |
337 | | for (i = 0; i < strips; i++) |
338 | | fz_write_printf(ctx, out, "/I%d %d 0 R", i, writer->obj_num + i); |
339 | | fz_write_printf(ctx, out, ">>/Font<</F0 3 0 R>>>>/MediaBox[0 0 %g %g]/Contents %d 0 R>>\nendobj\n", |
340 | | w * 72.0f / xres, h * 72.0f / yres, writer->obj_num + strips); |
341 | | } |
342 | | |
343 | | static void |
344 | | flush_strip(fz_context *ctx, pdfocr_band_writer *writer, int fill) |
345 | | { |
346 | | unsigned char *data = writer->stripbuf; |
347 | | fz_output *out = writer->super.out; |
348 | | int w = writer->super.w; |
349 | | int n = writer->super.n; |
350 | | size_t len = (size_t)w*n*fill; |
351 | | |
352 | | /* Buffer is full, compress it and write it. */ |
353 | | if (writer->options.compress) |
354 | | { |
355 | | size_t destLen = writer->complen; |
356 | | fz_deflate(ctx, writer->compbuf, &destLen, data, len, FZ_DEFLATE_DEFAULT); |
357 | | len = destLen; |
358 | | data = writer->compbuf; |
359 | | } |
360 | | fz_write_printf(ctx, out, "%d 0 obj\n<</Width %d/ColorSpace/Device%s/Height %d%s/Subtype/Image", |
361 | | new_obj(ctx, writer), w, n == 1 ? "Gray" : "RGB", fill, writer->options.compress ? "/Filter/FlateDecode" : ""); |
362 | | fz_write_printf(ctx, out, "/Length %zd/Type/XObject/BitsPerComponent 8>>\nstream\n", len); |
363 | | fz_write_data(ctx, out, data, len); |
364 | | fz_write_string(ctx, out, "\nendstream\nendobj\n"); |
365 | | } |
366 | | |
367 | | static void |
368 | | pdfocr_write_band(fz_context *ctx, fz_band_writer *writer_, int stride, int band_start, int band_height, const unsigned char *sp) |
369 | | { |
370 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
371 | | fz_output *out = writer->super.out; |
372 | | int w = writer->super.w; |
373 | | int h = writer->super.h; |
374 | | int n = writer->super.n; |
375 | | int sh = writer->options.strip_height; |
376 | | int line; |
377 | | unsigned char *d = writer->ocrbitmap->samples; |
378 | | |
379 | | if (!out) |
380 | | return; |
381 | | |
382 | | if (sh == 0) |
383 | | sh = h; |
384 | | |
385 | | for (line = 0; line < band_height; line++) |
386 | | { |
387 | | int dstline = (band_start+line) % sh; |
388 | | memcpy(writer->stripbuf + (size_t)w*n*dstline, |
389 | | sp + (size_t)line * w * n, |
390 | | (size_t)w * n); |
391 | | if (dstline+1 == sh) |
392 | | flush_strip(ctx, writer, dstline+1); |
393 | | } |
394 | | |
395 | | if (band_start + band_height == h && h % sh != 0) |
396 | | flush_strip(ctx, writer, h % sh); |
397 | | |
398 | | /* Copy strip to ocrbitmap, converting if required. */ |
399 | | d += band_start*w; |
400 | | if (n == 1) |
401 | | { |
402 | | int y; |
403 | | for (y = band_height; y > 0; y--) |
404 | | { |
405 | | memcpy(d, sp, w); |
406 | | if (writer->ocrbitmap->w - w) |
407 | | memset(d + w, 0, writer->ocrbitmap->w - w); |
408 | | d += writer->ocrbitmap->w; |
409 | | } |
410 | | } |
411 | | else |
412 | | { |
413 | | int x, y; |
414 | | for (y = band_height; y > 0; y--) |
415 | | { |
416 | | for (x = w; x > 0; x--) |
417 | | { |
418 | | *d++ = (sp[0] + 2*sp[1] + sp[2] + 2)>>2; |
419 | | sp += 3; |
420 | | } |
421 | | for (x = writer->ocrbitmap->w - w; x > 0; x--) |
422 | | *d++ = 0; |
423 | | } |
424 | | } |
425 | | } |
426 | | |
/* Bit flags recording which writing directions have been seen within a
   word; a line accumulates the flags of the words queued on it. */
enum
{
	WORD_CONTAINS_L2R = 1,
	WORD_CONTAINS_R2L = 2,
	WORD_CONTAINS_T2B = 4,
	WORD_CONTAINS_B2T = 8
};
434 | | |
/* One recognised word, queued on the current line until flush_words()
   writes it out and frees it. */
typedef struct word_t
{
	/* Next word on the same line, or NULL. */
	struct word_t *next;
	/* Word bounding box as x0, y0, x1, y1, as computed by char_callback(). */
	float bbox[4];
	/* Combination of WORD_CONTAINS_* flags. */
	int dirn;
	/* Number of entries in chars. */
	int len;
	/* Character codes; queue_word() over-allocates the struct so that
	 * this array really holds len entries. */
	int chars[1];
} word_t;
443 | | |
/* Working state shared by char_callback(), queue_word() and flush_words()
   while the text layer of one page is being generated. */
typedef struct
{
	/* Buffer collecting the page's content stream. */
	fz_buffer *buf;
	/* The band writer this text belongs to. */
	pdfocr_band_writer *writer;

	/* We collate the current word into the following fields: */
	int word_max;
	int word_len;
	int *word_chars;
	float word_bbox[4];
	int word_dirn;
	/* Bounding box of the character that started the current word;
	 * later characters are compared against it to detect direction. */
	int word_prev_char_bbox[4];

	/* When we finish a word, we try to add it to the line. If the
	 * word fits onto the end of the existing line, great. If not,
	 * we flush the entire line, and start a new one just with the
	 * new word. This enables us to output a whole line at once,
	 * which is beneficial to avoid jittering the font sizes
	 * up/down, which looks bad when we try to select text in the
	 * produced PDF. */
	word_t *line;
	word_t **line_tail;
	float line_bbox[4];
	int line_dirn;

	/* Font size and horizontal scale most recently written to buf, so
	 * that unchanged values are not emitted again. */
	float cur_size;
	float cur_scale;
	/* Text position established by the most recent Td operator. */
	float tx, ty;
} char_callback_data_t;
473 | | |
474 | | static void |
475 | | flush_words(fz_context *ctx, char_callback_data_t *cb) |
476 | | { |
477 | | float size; |
478 | | |
479 | | if (cb->line == NULL) |
480 | | return; |
481 | | |
482 | | if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0) |
483 | | { |
484 | | /* Vertical line */ |
485 | | } |
486 | | else |
487 | | { |
488 | | /* Horizontal line */ |
489 | | size = cb->line_bbox[3] - cb->line_bbox[1]; |
490 | | |
491 | | if (size != 0 && size != cb->cur_size) |
492 | | { |
493 | | fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size); |
494 | | cb->cur_size = size; |
495 | | } |
496 | | /* Guard against division by 0. This makes no difference to the |
497 | | * actual calculation as if size is 0, word->bbox[2] == word->bbox[0] |
498 | | * too. */ |
499 | | if (size == 0) |
500 | | size = 1; |
501 | | } |
502 | | |
503 | | while (cb->line) |
504 | | { |
505 | | word_t *word = cb->line; |
506 | | float x, y; |
507 | | int i, len = word->len; |
508 | | float scale; |
509 | | |
510 | | if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0) |
511 | | { |
512 | | /* Contains vertical text. */ |
513 | | size = (word->bbox[3] - word->bbox[1]) / len; |
514 | | if (size == 0) |
515 | | size = 1; |
516 | | if (size != cb->cur_size) |
517 | | { |
518 | | fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size); |
519 | | cb->cur_size = size; |
520 | | } |
521 | | |
522 | | /* Set the scale so that our glyphs fill the line bbox. */ |
523 | | scale = (cb->line_bbox[2] - cb->line_bbox[0]) / size * 200; |
524 | | if (scale != 0) |
525 | | { |
526 | | float letter_height = (word->bbox[3] - word->bbox[1]) / len; |
527 | | |
528 | | if (scale != cb->cur_scale) |
529 | | { |
530 | | fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale); |
531 | | cb->cur_scale = scale; |
532 | | } |
533 | | |
534 | | for (i = 0; i < len; i++) |
535 | | { |
536 | | x = word->bbox[0]; |
537 | | y = word->bbox[1] + letter_height * i; |
538 | | fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty); |
539 | | cb->tx = x; |
540 | | cb->ty = y; |
541 | | |
542 | | fz_append_printf(ctx, cb->buf, "<%04x>Tj\n", word->chars[i]); |
543 | | } |
544 | | } |
545 | | } |
546 | | else |
547 | | { |
548 | | scale = (word->bbox[2] - word->bbox[0]) / size / len * 200; |
549 | | if (scale != 0) |
550 | | { |
551 | | if (scale != cb->cur_scale) |
552 | | { |
553 | | fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale); |
554 | | cb->cur_scale = scale; |
555 | | } |
556 | | |
557 | | if ((word->dirn & (WORD_CONTAINS_R2L | WORD_CONTAINS_L2R)) == WORD_CONTAINS_R2L) |
558 | | { |
559 | | /* Purely R2L text */ |
560 | | x = word->bbox[0]; |
561 | | y = cb->line_bbox[1]; |
562 | | fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty); |
563 | | cb->tx = x; |
564 | | cb->ty = y; |
565 | | |
566 | | /* Tesseract has sent us R2L text in R2L order (i.e. in Logical order). |
567 | | * We want to output it in that same logical order, but PDF operators |
568 | | * all move the point as if outputting L2R. We can either reverse the |
569 | | * order of chars (bad, because of cut/paste) or we can perform |
570 | | * gymnastics with the position. We opt for the latter. */ |
571 | | fz_append_printf(ctx, cb->buf, "["); |
572 | | for (i = 0; i < len; i++) |
573 | | { |
574 | | if (i == 0) |
575 | | { |
576 | | if (len > 1) |
577 | | fz_append_printf(ctx, cb->buf, "%d", -500*(len-1)); |
578 | | } |
579 | | else |
580 | | fz_append_printf(ctx, cb->buf, "%d", 1000); |
581 | | fz_append_printf(ctx, cb->buf, "<%04x>", word->chars[i]); |
582 | | } |
583 | | fz_append_printf(ctx, cb->buf, "]TJ\n"); |
584 | | } |
585 | | else |
586 | | { |
587 | | /* L2R (or mixed) text */ |
588 | | x = word->bbox[0]; |
589 | | y = cb->line_bbox[1]; |
590 | | fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty); |
591 | | cb->tx = x; |
592 | | cb->ty = y; |
593 | | |
594 | | fz_append_printf(ctx, cb->buf, "<"); |
595 | | for (i = 0; i < len; i++) |
596 | | fz_append_printf(ctx, cb->buf, "%04x", word->chars[i]); |
597 | | fz_append_printf(ctx, cb->buf, ">Tj\n"); |
598 | | } |
599 | | } |
600 | | } |
601 | | |
602 | | cb->line = word->next; |
603 | | fz_free(ctx, word); |
604 | | } |
605 | | |
606 | | cb->line_tail = &cb->line; |
607 | | cb->line = NULL; |
608 | | cb->line_dirn = 0; |
609 | | } |
610 | | |
611 | | static void |
612 | | queue_word(fz_context *ctx, char_callback_data_t *cb) |
613 | | { |
614 | | word_t *word; |
615 | | int line_is_v, line_is_h, word_is_v, word_is_h; |
616 | | |
617 | | if (cb->word_len == 0) |
618 | | return; |
619 | | |
620 | | word = fz_malloc(ctx, sizeof(*word) + (cb->word_len-1)*sizeof(int)); |
621 | | word->next = NULL; |
622 | | word->len = cb->word_len; |
623 | | memcpy(word->bbox, cb->word_bbox, 4*sizeof(float)); |
624 | | memcpy(word->chars, cb->word_chars, cb->word_len * sizeof(int)); |
625 | | cb->word_len = 0; |
626 | | |
627 | | line_is_v = !!(cb->line_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B)); |
628 | | word_is_v = !!(cb->word_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B)); |
629 | | line_is_h = !!(cb->line_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L)); |
630 | | word_is_h = !!(cb->word_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L)); |
631 | | |
632 | | word->dirn = cb->word_dirn; |
633 | | cb->word_dirn = 0; |
634 | | |
635 | | /* Can we put the new word onto the end of the existing line? */ |
636 | | if (cb->line != NULL && |
637 | | !line_is_v && !word_is_v && |
638 | | word->bbox[1] <= cb->line_bbox[3] && |
639 | | word->bbox[3] >= cb->line_bbox[1] && |
640 | | (word->bbox[0] >= cb->line_bbox[2] || word->bbox[2] <= cb->line_bbox[0])) |
641 | | { |
642 | | /* Can append (horizontal motion). */ |
643 | | if (word->bbox[0] < cb->line_bbox[0]) |
644 | | cb->line_bbox[0] = word->bbox[0]; |
645 | | if (word->bbox[1] < cb->line_bbox[1]) |
646 | | cb->line_bbox[1] = word->bbox[1]; |
647 | | if (word->bbox[2] > cb->line_bbox[2]) |
648 | | cb->line_bbox[2] = word->bbox[2]; |
649 | | if (word->bbox[3] > cb->line_bbox[3]) |
650 | | cb->line_bbox[3] = word->bbox[3]; |
651 | | } |
652 | | else if (cb->line != NULL && |
653 | | !line_is_h && !word_is_h && |
654 | | word->bbox[0] <= cb->line_bbox[2] && |
655 | | word->bbox[2] >= cb->line_bbox[0] && |
656 | | (word->bbox[1] >= cb->line_bbox[3] || word->bbox[3] <= cb->line_bbox[1])) |
657 | | { |
658 | | /* Can append (vertical motion). */ |
659 | | if (!word_is_v) |
660 | | word->dirn |= WORD_CONTAINS_T2B; |
661 | | if (word->bbox[0] < cb->line_bbox[0]) |
662 | | cb->line_bbox[0] = word->bbox[0]; |
663 | | if (word->bbox[1] < cb->line_bbox[1]) |
664 | | cb->line_bbox[1] = word->bbox[1]; |
665 | | if (word->bbox[2] > cb->line_bbox[2]) |
666 | | cb->line_bbox[2] = word->bbox[2]; |
667 | | if (word->bbox[3] > cb->line_bbox[3]) |
668 | | cb->line_bbox[3] = word->bbox[3]; |
669 | | } |
670 | | else |
671 | | { |
672 | | fz_try(ctx) |
673 | | flush_words(ctx, cb); |
674 | | fz_catch(ctx) |
675 | | { |
676 | | fz_free(ctx, word); |
677 | | fz_rethrow(ctx); |
678 | | } |
679 | | memcpy(cb->line_bbox, word->bbox, 4*sizeof(float)); |
680 | | } |
681 | | |
682 | | *cb->line_tail = word; |
683 | | cb->line_tail = &word->next; |
684 | | cb->line_dirn |= word->dirn; |
685 | | } |
686 | | |
687 | | static void |
688 | | char_callback(fz_context *ctx, void *arg, int unicode, |
689 | | const char *font_name, |
690 | | const int *line_bbox, const int *word_bbox, |
691 | | const int *char_bbox, int pointsize) |
692 | | { |
693 | | char_callback_data_t *cb = (char_callback_data_t *)arg; |
694 | | pdfocr_band_writer *writer = cb->writer; |
695 | | float bbox[4]; |
696 | | |
697 | | bbox[0] = word_bbox[0] * 72.0f / cb->writer->ocrbitmap->xres; |
698 | | bbox[3] = (writer->ocrbitmap->h - 1 - word_bbox[1]) * 72.0f / cb->writer->ocrbitmap->yres; |
699 | | bbox[2] = word_bbox[2] * 72.0f / cb->writer->ocrbitmap->yres; |
700 | | bbox[1] = (writer->ocrbitmap->h - 1 - word_bbox[3]) * 72.0f / cb->writer->ocrbitmap->yres; |
701 | | |
702 | | if (bbox[0] != cb->word_bbox[0] || |
703 | | bbox[1] != cb->word_bbox[1] || |
704 | | bbox[2] != cb->word_bbox[2] || |
705 | | bbox[3] != cb->word_bbox[3]) |
706 | | { |
707 | | queue_word(ctx, cb); |
708 | | memcpy(cb->word_bbox, bbox, 4 * sizeof(float)); |
709 | | } |
710 | | |
711 | | if (cb->word_len == 0) |
712 | | { |
713 | | cb->word_dirn = 0; |
714 | | memcpy(cb->word_prev_char_bbox, char_bbox, 4 * sizeof(int)); |
715 | | } |
716 | | else |
717 | | { |
718 | | int ox = cb->word_prev_char_bbox[0] + cb->word_prev_char_bbox[2]; |
719 | | int oy = cb->word_prev_char_bbox[1] + cb->word_prev_char_bbox[3]; |
720 | | int x = char_bbox[0] + char_bbox[2] - ox; |
721 | | int y = char_bbox[1] + char_bbox[3] - oy; |
722 | | int ax = x < 0 ? -x : x; |
723 | | int ay = y < 0 ? -y : y; |
724 | | if (ax > ay) |
725 | | { |
726 | | if (x > 0) |
727 | | cb->word_dirn |= WORD_CONTAINS_L2R; |
728 | | else if (x < 0) |
729 | | cb->word_dirn |= WORD_CONTAINS_R2L; |
730 | | } |
731 | | else if (ay < ax) |
732 | | { |
733 | | if (y > 0) |
734 | | cb->word_dirn |= WORD_CONTAINS_T2B; |
735 | | else if (y < 0) |
736 | | cb->word_dirn |= WORD_CONTAINS_B2T; |
737 | | } |
738 | | } |
739 | | |
740 | | if (cb->word_max == cb->word_len) |
741 | | { |
742 | | int newmax = cb->word_max * 2; |
743 | | if (newmax == 0) |
744 | | newmax = 16; |
745 | | cb->word_chars = fz_realloc_array(ctx, cb->word_chars, newmax, int); |
746 | | cb->word_max = newmax; |
747 | | } |
748 | | |
749 | | cb->word_chars[cb->word_len++] = unicode; |
750 | | } |
751 | | |
752 | | static int |
753 | | pdfocr_progress(fz_context *ctx, void *arg, int prog) |
754 | | { |
755 | | char_callback_data_t *cb = (char_callback_data_t *)arg; |
756 | | pdfocr_band_writer *writer = cb->writer; |
757 | | |
758 | | if (writer->progress == NULL) |
759 | | return 0; |
760 | | |
761 | | return writer->progress(ctx, writer->progress_arg, writer->pages - 1, prog); |
762 | | } |
763 | | |
764 | | static void |
765 | | pdfocr_write_trailer(fz_context *ctx, fz_band_writer *writer_) |
766 | | { |
767 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
768 | | fz_output *out = writer->super.out; |
769 | | int w = writer->super.w; |
770 | | int h = writer->super.h; |
771 | | int xres = writer->super.xres; |
772 | | int yres = writer->super.yres; |
773 | | int sh = writer->options.strip_height; |
774 | | int strips; |
775 | | int i; |
776 | | size_t len; |
777 | | unsigned char *data; |
778 | | fz_buffer *buf = NULL; |
779 | | char_callback_data_t cb = { NULL }; |
780 | | |
781 | | if (sh == 0) |
782 | | sh = h; |
783 | | strips = (h + sh-1)/sh; |
784 | | |
785 | | /* Send the Page contents */ |
786 | | /* We need the length to this, so write to a buffer first */ |
787 | | fz_var(buf); |
788 | | fz_var(cb); |
789 | | fz_try(ctx) |
790 | | { |
791 | | cb.writer = writer; |
792 | | cb.buf = buf = fz_new_buffer(ctx, 0); |
793 | | cb.line_tail = &cb.line; |
794 | | cb.word_dirn = 0; |
795 | | cb.line_dirn = 0; |
796 | | fz_append_printf(ctx, buf, "q\n%g 0 0 %g 0 0 cm\n", 72.0f/xres, 72.0f/yres); |
797 | | for (i = 0; i < strips; i++) |
798 | | { |
799 | | int at = h - (i+1)*sh; |
800 | | int this_sh = sh; |
801 | | if (at < 0) |
802 | | { |
803 | | this_sh += at; |
804 | | at = 0; |
805 | | } |
806 | | fz_append_printf(ctx, buf, "/P <</MCID 0>> BDC\nq\n%d 0 0 %d 0 %d cm\n/I%d Do\nQ\n", |
807 | | w, this_sh, at, i); |
808 | | } |
809 | | |
810 | | fz_append_printf(ctx, buf, "Q\nBT\n3 Tr\n"); |
811 | | |
812 | | ocr_recognise(ctx, writer->tessapi, writer->ocrbitmap, char_callback, pdfocr_progress, &cb); |
813 | | queue_word(ctx, &cb); |
814 | | flush_words(ctx, &cb); |
815 | | fz_append_printf(ctx, buf, "ET\n"); |
816 | | |
817 | | len = fz_buffer_storage(ctx, buf, &data); |
818 | | fz_write_printf(ctx, out, "%d 0 obj\n<</Length %zd>>\nstream\n", new_obj(ctx, writer), len); |
819 | | fz_write_data(ctx, out, data, len); |
820 | | fz_drop_buffer(ctx, buf); |
821 | | buf = NULL; |
822 | | fz_write_string(ctx, out, "\nendstream\nendobj\n"); |
823 | | } |
824 | | fz_always(ctx) |
825 | | { |
826 | | fz_free(ctx, cb.word_chars); |
827 | | } |
828 | | fz_catch(ctx) |
829 | | { |
830 | | fz_drop_buffer(ctx, buf); |
831 | | fz_rethrow(ctx); |
832 | | } |
833 | | } |
834 | | |
835 | | static void |
836 | | pdfocr_close_band_writer(fz_context *ctx, fz_band_writer *writer_) |
837 | | { |
838 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
839 | | fz_output *out = writer->super.out; |
840 | | int i; |
841 | | |
842 | | /* We actually do the trailer writing in the close */ |
843 | | if (writer->xref_max > 2) |
844 | | { |
845 | | int64_t t_pos; |
846 | | |
847 | | /* Catalog */ |
848 | | writer->xref[1] = fz_tell_output(ctx, out); |
849 | | fz_write_printf(ctx, out, "1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n"); |
850 | | |
851 | | /* Page table */ |
852 | | writer->xref[2] = fz_tell_output(ctx, out); |
853 | | fz_write_printf(ctx, out, "2 0 obj\n<</Count %d/Kids[", writer->pages); |
854 | | |
855 | | for (i = 0; i < writer->pages; i++) |
856 | | { |
857 | | if (i > 0) |
858 | | fz_write_byte(ctx, out, ' '); |
859 | | fz_write_printf(ctx, out, "%d 0 R", writer->page_obj[i]); |
860 | | } |
861 | | fz_write_string(ctx, out, "]/Type/Pages>>\nendobj\n"); |
862 | | |
863 | | /* Xref */ |
864 | | t_pos = fz_tell_output(ctx, out); |
865 | | fz_write_printf(ctx, out, "xref\n0 %d\n0000000000 65535 f \n", writer->obj_num); |
866 | | for (i = 1; i < writer->obj_num; i++) |
867 | | fz_write_printf(ctx, out, "%010ld 00000 n \n", writer->xref[i]); |
868 | | fz_write_printf(ctx, out, "trailer\n<</Size %d/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n", writer->obj_num, t_pos); |
869 | | } |
870 | | } |
871 | | |
872 | | static void |
873 | | pdfocr_drop_band_writer(fz_context *ctx, fz_band_writer *writer_) |
874 | | { |
875 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
876 | | fz_free(ctx, writer->stripbuf); |
877 | | fz_free(ctx, writer->compbuf); |
878 | | fz_free(ctx, writer->page_obj); |
879 | | fz_free(ctx, writer->xref); |
880 | | fz_drop_pixmap(ctx, writer->ocrbitmap); |
881 | | ocr_fin(ctx, writer->tessapi); |
882 | | } |
883 | | #endif |
884 | | |
885 | | fz_band_writer *fz_new_pdfocr_band_writer(fz_context *ctx, fz_output *out, const fz_pdfocr_options *options) |
886 | 0 | { |
887 | 0 | #ifdef OCR_DISABLED |
888 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
889 | | #else |
890 | | pdfocr_band_writer *writer = fz_new_band_writer(ctx, pdfocr_band_writer, out); |
891 | | |
892 | | writer->super.header = pdfocr_write_header; |
893 | | writer->super.band = pdfocr_write_band; |
894 | | writer->super.trailer = pdfocr_write_trailer; |
895 | | writer->super.close = pdfocr_close_band_writer; |
896 | | writer->super.drop = pdfocr_drop_band_writer; |
897 | | |
898 | | if (options) |
899 | | writer->options = *options; |
900 | | else |
901 | | memset(&writer->options, 0, sizeof(writer->options)); |
902 | | |
903 | | /* Objects: |
904 | | * 1 reserved for catalog |
905 | | * 2 for pages tree |
906 | | * 3 font |
907 | | * 4 cidfont |
908 | | * 5 cid to gid map |
909 | | * 6 tounicode |
910 | | * 7 font descriptor |
911 | | * 8 font file |
912 | | */ |
913 | | writer->obj_num = 9; |
914 | | |
915 | | fz_try(ctx) |
916 | | { |
917 | | writer->tessapi = ocr_init(ctx, writer->options.language, writer->options.datadir); |
918 | | } |
919 | | fz_catch(ctx) |
920 | | { |
921 | | fz_drop_band_writer(ctx, &writer->super); |
922 | | fz_throw(ctx, FZ_ERROR_GENERIC, "OCR initialisation failed"); |
923 | | } |
924 | | |
925 | | return &writer->super; |
926 | | #endif |
927 | 0 | } |
928 | | |
929 | | void |
930 | | fz_pdfocr_band_writer_set_progress(fz_context *ctx, fz_band_writer *writer_, fz_pdfocr_progress_fn *progress, void *progress_arg) |
931 | 0 | { |
932 | 0 | #ifdef OCR_DISABLED |
933 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
934 | | #else |
935 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
936 | | if (writer == NULL) |
937 | | return; |
938 | | if (writer->super.header != pdfocr_write_header) |
939 | | fz_throw(ctx, FZ_ERROR_GENERIC, "Not a pdfocr band writer!"); |
940 | | |
941 | | writer->progress = progress; |
942 | | writer->progress_arg = progress_arg; |
943 | | #endif |
944 | 0 | } |
945 | | |
946 | | void |
947 | | fz_save_pixmap_as_pdfocr(fz_context *ctx, fz_pixmap *pixmap, char *filename, int append, const fz_pdfocr_options *pdfocr) |
948 | 0 | { |
949 | 0 | #ifdef OCR_DISABLED |
950 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
951 | | #else |
952 | | fz_output *out = fz_new_output_with_path(ctx, filename, append); |
953 | | fz_try(ctx) |
954 | | { |
955 | | fz_write_pixmap_as_pdfocr(ctx, out, pixmap, pdfocr); |
956 | | fz_close_output(ctx, out); |
957 | | } |
958 | | fz_always(ctx) |
959 | | fz_drop_output(ctx, out); |
960 | | fz_catch(ctx) |
961 | | fz_rethrow(ctx); |
962 | | #endif |
963 | 0 | } |
964 | | |
965 | | /* High-level document writer interface */ |
966 | | |
967 | | #ifndef OCR_DISABLED |
968 | | typedef struct |
969 | | { |
970 | | fz_document_writer super; |
971 | | fz_draw_options draw; |
972 | | fz_pdfocr_options pdfocr; |
973 | | fz_pixmap *pixmap; |
974 | | fz_band_writer *bander; |
975 | | fz_output *out; |
976 | | int pagenum; |
977 | | } fz_pdfocr_writer; |
978 | | |
979 | | static fz_device * |
980 | | pdfocr_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) |
981 | | { |
982 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
983 | | return fz_new_draw_device_with_options(ctx, &wri->draw, mediabox, &wri->pixmap); |
984 | | } |
985 | | |
986 | | static void |
987 | | pdfocr_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) |
988 | | { |
989 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
990 | | fz_pixmap *pix = wri->pixmap; |
991 | | |
992 | | fz_try(ctx) |
993 | | { |
994 | | fz_close_device(ctx, dev); |
995 | | fz_write_header(ctx, wri->bander, pix->w, pix->h, pix->n, pix->alpha, pix->xres, pix->yres, wri->pagenum++, pix->colorspace, pix->seps); |
996 | | fz_write_band(ctx, wri->bander, pix->stride, pix->h, pix->samples); |
997 | | } |
998 | | fz_always(ctx) |
999 | | { |
1000 | | fz_drop_device(ctx, dev); |
1001 | | fz_drop_pixmap(ctx, pix); |
1002 | | wri->pixmap = NULL; |
1003 | | } |
1004 | | fz_catch(ctx) |
1005 | | fz_rethrow(ctx); |
1006 | | } |
1007 | | |
1008 | | static void |
1009 | | pdfocr_close_writer(fz_context *ctx, fz_document_writer *wri_) |
1010 | | { |
1011 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
1012 | | |
1013 | | fz_close_band_writer(ctx, wri->bander); |
1014 | | fz_close_output(ctx, wri->out); |
1015 | | } |
1016 | | |
1017 | | static void |
1018 | | pdfocr_drop_writer(fz_context *ctx, fz_document_writer *wri_) |
1019 | | { |
1020 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
1021 | | |
1022 | | fz_drop_pixmap(ctx, wri->pixmap); |
1023 | | fz_drop_band_writer(ctx, wri->bander); |
1024 | | fz_drop_output(ctx, wri->out); |
1025 | | } |
1026 | | #endif |
1027 | | |
1028 | | fz_document_writer * |
1029 | | fz_new_pdfocr_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
1030 | 0 | { |
1031 | 0 | #ifdef OCR_DISABLED |
1032 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
1033 | | #else |
1034 | | fz_pdfocr_writer *wri = NULL; |
1035 | | |
1036 | | fz_var(wri); |
1037 | | |
1038 | | fz_try(ctx) |
1039 | | { |
1040 | | wri = fz_new_derived_document_writer(ctx, fz_pdfocr_writer, pdfocr_begin_page, pdfocr_end_page, pdfocr_close_writer, pdfocr_drop_writer); |
1041 | | fz_parse_draw_options(ctx, &wri->draw, options); |
1042 | | fz_parse_pdfocr_options(ctx, &wri->pdfocr, options); |
1043 | | wri->out = out; |
1044 | | wri->bander = fz_new_pdfocr_band_writer(ctx, wri->out, &wri->pdfocr); |
1045 | | } |
1046 | | fz_catch(ctx) |
1047 | | { |
1048 | | fz_drop_output(ctx, out); |
1049 | | fz_free(ctx, wri); |
1050 | | fz_rethrow(ctx); |
1051 | | } |
1052 | | |
1053 | | return (fz_document_writer*)wri; |
1054 | | #endif |
1055 | 0 | } |
1056 | | |
1057 | | fz_document_writer * |
1058 | | fz_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options) |
1059 | 0 | { |
1060 | 0 | #ifdef OCR_DISABLED |
1061 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
1062 | | #else |
1063 | | fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.pdfocr", 0); |
1064 | | return fz_new_pdfocr_writer_with_output(ctx, out, options); |
1065 | | #endif |
1066 | 0 | } |
1067 | | |
1068 | | void |
1069 | | fz_pdfocr_writer_set_progress(fz_context *ctx, fz_document_writer *writer, fz_pdfocr_progress_fn *progress, void *progress_arg) |
1070 | 0 | { |
1071 | 0 | #ifdef OCR_DISABLED |
1072 | 0 | fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build"); |
1073 | | #else |
1074 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer *)writer; |
1075 | | if (!writer) |
1076 | | return; |
1077 | | if (writer->begin_page != pdfocr_begin_page) |
1078 | | fz_throw(ctx, FZ_ERROR_GENERIC, "Not a pdfocr writer!"); |
1079 | | fz_pdfocr_band_writer_set_progress(ctx, wri->bander, progress, progress_arg); |
1080 | | #endif |
1081 | 0 | } |