/src/mupdf/source/fitz/output-pdfocr.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2025 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #include <assert.h> |
26 | | #include <string.h> |
27 | | #include <limits.h> |
28 | | |
29 | | #ifdef OCR_DISABLED |
30 | | |
/* In non-OCR builds, we need to define this otherwise SWIG Python gets SEGV
when it attempts to import mupdf.py and _mupdf.py. The text is left empty
because no PDFOCR options are usable in such a build. */
const char *fz_pdfocr_write_options_usage = "";
34 | | |
35 | | #else |
36 | | |
37 | | #include "tessocr.h" |
38 | | |
/* Help text describing the option keys read by fz_parse_pdfocr_options(). */
const char *fz_pdfocr_write_options_usage =
	"PDFOCR output options:\n"
	"\tcompression=none: No compression (default)\n"
	"\tcompression=flate: Flate compression\n"
	"\tstrip-height=N: Strip height (default 0=fullpage)\n"
	"\tocr-language=<lang>: OCR language (default=eng)\n"
	"\tocr-datadir=<datadir>: OCR data path (default=rely on TESSDATA_PREFIX)\n"
	"\tskew=none,auto,<angle>: Whether to skew correct (default=none).\n"
	"\tskew-border=increase,maintain,decrease: Size change for border pixels (default=increase).\n"
	"\n";
49 | | |
/* PDF object 3: the Type0 "GlyphLessFont" that pages reference as /F0.
 * It points at the descendant CID font (object 4) and the ToUnicode
 * CMap (object 6). Written verbatim into the output file. */
static const char funky_font[] =
	"3 0 obj\n<</BaseFont/GlyphLessFont/DescendantFonts[4 0 R]"
	"/Encoding/Identity-H/Subtype/Type0/ToUnicode 6 0 R/Type/Font"
	">>\nendobj\n";
54 | | |
/* PDF object 4: the CIDFontType2 descendant font. It references the
 * CIDToGIDMap stream (object 5) and the font descriptor (object 7), and
 * gives every glyph a default width (/DW) of 500. */
static const char funky_font2[] =
	"4 0 obj\n"
	"<</BaseFont/GlyphLessFont/CIDToGIDMap 5 0 R"
	"/CIDSystemInfo<</Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
	"/FontDescriptor 7 0 R/Subtype/CIDFontType2/Type/Font/DW 500>>"
	"\nendobj\n";
61 | | |
/* PDF object 5: the CIDToGIDMap stream referenced by object 4. The payload
 * is 210 bytes of Flate-compressed data (matching /Length 210). The array
 * contains embedded NUL bytes, so it must be written using sizeof, never
 * strlen. */
static const char funky_font3[] =
	"5 0 obj\n<</Length 210/Filter/FlateDecode>>\nstream\n"
	"\x78\x9c\xec\xc2\x01\x09\x00\x00\x00\x02\xa0\xfa\x7f\xba\x21\x89"
	"\xa6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x80\x7b\x03\x00\x00\xff\xff\xec\xc2\x01\x0d\x00\x00\x00\xc2\x20"
	"\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xec\xc2\x01\x0d\x00"
	"\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xed"
	"\xc2\x01\x0d\x00\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\xff"
	"\x00\x10"
	"\nendstream\nendobj\n";
79 | | |
/* PDF object 6: the ToUnicode CMap referenced by object 3. It declares a
 * single two-byte code space <0000>..<FFFF> and one bfrange that maps that
 * whole range starting at <0000>, i.e. each code maps to the same-valued
 * Unicode code unit. */
static const char funky_font4[] =
	"6 0 obj\n<</Length 353>>\nstream\n"
	"/CIDInit /ProcSet findresource begin\n"
	"12 dict begin\n"
	"begincmap\n"
	"/CIDSystemInfo\n"
	"<<\n"
	"  /Registry (Adobe)\n"
	"  /Ordering (UCS)\n"
	"  /Supplement 0\n"
	">> def\n"
	"/CMapName /Adobe-Identity-UCS def\n"
	"/CMapType 2 def\n"
	"1 begincodespacerange\n"
	"<0000> <FFFF>\n"
	"endcodespacerange\n"
	"1 beginbfrange\n"
	"<0000> <FFFF> <0000>\n"
	"endbfrange\n"
	"endcmap\n"
	"CMapName currentdict /CMap defineresource pop\n"
	"end\n"
	"end\n"
	"endstream\n"
	"endobj\n";
105 | | |
/* PDF object 7: the FontDescriptor referenced by object 4. It names the
 * embedded font program held in object 8 via /FontFile2. */
static const char funky_font5[] =
	"7 0 obj\n"
	"<</Ascent 1000/CapHeight 1000/Descent -1/Flags 5"
	"/FontBBox[0 0 500 1000]/FontFile2 8 0 R/FontName/GlyphLessFont"
	"/ItalicAngle 0/StemV 80/Type/FontDescriptor>>\nendobj\n";
111 | | |
/* PDF object 8: the embedded font program referenced as /FontFile2 by
 * object 7. The stream is 572 uncompressed bytes (matching /Length and
 * /Length1). The table tags visible in the data (OS/2, cmap, glyf, head,
 * ...) indicate a TrueType font. The array contains embedded NUL bytes, so
 * it must be written using sizeof, never strlen. */
static const char funky_font6[] =
	"8 0 obj\n<</Length 572/Length1 572>>\nstream\n"
	"\x00\x01\x00\x00\x00\x0a\x00\x80\x00\x03\x00\x20\x4f\x53\x2f\x32"
	"\x56\xde\xc8\x94\x00\x00\x01\x28\x00\x00\x00\x60\x63\x6d\x61\x70"
	"\x00\x0a\x00\x34\x00\x00\x01\x90\x00\x00\x00\x1e\x67\x6c\x79\x66"
	"\x15\x22\x41\x24\x00\x00\x01\xb8\x00\x00\x00\x18\x68\x65\x61\x64"
	"\x0b\x78\xf1\x65\x00\x00\x00\xac\x00\x00\x00\x36\x68\x68\x65\x61"
	"\x0c\x02\x04\x02\x00\x00\x00\xe4\x00\x00\x00\x24\x68\x6d\x74\x78"
	"\x04\x00\x00\x00\x00\x00\x01\x88\x00\x00\x00\x08\x6c\x6f\x63\x61"
	"\x00\x0c\x00\x00\x00\x00\x01\xb0\x00\x00\x00\x06\x6d\x61\x78\x70"
	"\x00\x04\x00\x05\x00\x00\x01\x08\x00\x00\x00\x20\x6e\x61\x6d\x65"
	"\xf2\xeb\x16\xda\x00\x00\x01\xd0\x00\x00\x00\x4b\x70\x6f\x73\x74"
	"\x00\x01\x00\x01\x00\x00\x02\x1c\x00\x00\x00\x20\x00\x01\x00\x00"
	"\x00\x01\x00\x00\xb0\x94\x71\x10\x5f\x0f\x3c\xf5\x04\x07\x08\x00"
	"\x00\x00\x00\x00\xcf\x9a\xfc\x6e\x00\x00\x00\x00\xd4\xc3\xa7\xf2"
	"\x00\x00\x00\x00\x04\x00\x08\x00\x00\x00\x00\x10\x00\x02\x00\x00"
	"\x00\x00\x00\x00\x00\x01\x00\x00\x08\x00\xff\xff\x00\x00\x04\x00"
	"\x00\x00\x00\x00\x04\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x02\x00\x01\x00\x00\x00\x02\x00\x04"
	"\x00\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x01\x90\x00\x05"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x01\x00\x01\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x47\x4f\x4f\x47\x00\x40\x00\x00\x00\x00\x00\x01\xff\xff"
	"\x00\x00\x00\x01\x00\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00"
	"\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00\x14\x00\x03\x00\x00"
	"\x00\x00\x00\x14\x00\x06\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x00\x04\x00"
	"\x08\x00\x00\x03\x00\x00\x31\x21\x11\x21\x04\x00\xfc\x00\x08\x00"
	"\x00\x00\x00\x03\x00\x2a\x00\x00\x00\x03\x00\x00\x00\x05\x00\x16"
	"\x00\x00\x00\x01\x00\x00\x00\x00\x00\x05\x00\x0b\x00\x16\x00\x03"
	"\x00\x01\x04\x09\x00\x05\x00\x16\x00\x00\x00\x56\x00\x65\x00\x72"
	"\x00\x73\x00\x69\x00\x6f\x00\x6e\x00\x20\x00\x31\x00\x2e\x00\x30"
	"\x56\x65\x72\x73\x69\x6f\x6e\x20\x31\x2e\x30\x00\x00\x01\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00"
	"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
	"\nendstream\nendobj\n";
151 | | |
152 | | #endif |
153 | | |
154 | | fz_pdfocr_options * |
155 | | fz_parse_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, const char *args) |
156 | 0 | { |
157 | 0 | #ifdef OCR_DISABLED |
158 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
159 | | #else |
160 | | const char *val; |
161 | | |
162 | | memset(opts, 0, sizeof *opts); |
163 | | |
164 | | if (fz_has_option(ctx, args, "compression", &val)) |
165 | | { |
166 | | if (fz_option_eq(val, "none")) |
167 | | opts->compress = 0; |
168 | | else if (fz_option_eq(val, "flate")) |
169 | | opts->compress = 1; |
170 | | else |
171 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported PDFOCR compression %s (none, or flate only)", val); |
172 | | } |
173 | | if (fz_has_option(ctx, args, "strip-height", &val)) |
174 | | { |
175 | | int i = fz_atoi(val); |
176 | | if (i <= 0) |
177 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported PDFOCR strip height %d (suggest 0)", i); |
178 | | opts->strip_height = i; |
179 | | } |
180 | | if (fz_has_option(ctx, args, "ocr-language", &val)) |
181 | | { |
182 | | fz_copy_option(ctx, val, opts->language, nelem(opts->language)); |
183 | | } |
184 | | if (fz_has_option(ctx, args, "ocr-datadir", &val)) |
185 | | { |
186 | | fz_copy_option(ctx, val, opts->datadir, nelem(opts->datadir)); |
187 | | } |
188 | | if (fz_has_option(ctx, args, "skew", &val)) |
189 | | { |
190 | | if (fz_option_eq(val, "auto")) |
191 | | opts->skew_correct = 1; |
192 | | else |
193 | | { |
194 | | opts->skew_correct = 2; |
195 | | opts->skew_angle = fz_atof(val); |
196 | | } |
197 | | } |
198 | | if (fz_has_option(ctx, args, "skew-border", &val)) |
199 | | { |
200 | | if (fz_option_eq(val, "increase")) |
201 | | opts->skew_border = 0; |
202 | | else if (fz_option_eq(val, "maintain")) |
203 | | opts->skew_border = 1; |
204 | | else if (fz_option_eq(val, "decrease")) |
205 | | opts->skew_border = 2; |
206 | | else |
207 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported skew-border option"); |
208 | | } |
209 | | |
210 | | return opts; |
211 | | #endif |
212 | 0 | } |
213 | | |
/*
	Write a single pixmap to out as a one-page PDFOCR file.

	A temporary PDFOCR band writer is created, the whole pixmap is
	passed to it as one band, and the writer is closed and dropped.
	Does nothing if pixmap or out is NULL.

	In builds with OCR_DISABLED defined, always throws
	FZ_ERROR_UNSUPPORTED.
*/
void
fz_write_pixmap_as_pdfocr(fz_context *ctx, fz_output *out, const fz_pixmap *pixmap, const fz_pdfocr_options *pdfocr)
{
#ifdef OCR_DISABLED
	fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
#else
	fz_band_writer *writer;

	if (!pixmap || !out)
		return;

	writer = fz_new_pdfocr_band_writer(ctx, out, pdfocr);
	fz_try(ctx)
	{
		fz_write_header(ctx, writer, pixmap->w, pixmap->h, pixmap->n, pixmap->alpha, pixmap->xres, pixmap->yres, 0, pixmap->colorspace, pixmap->seps);
		/* The entire pixmap is sent as a single band. */
		fz_write_band(ctx, writer, pixmap->stride, pixmap->h, pixmap->samples);
		fz_close_band_writer(ctx, writer);
	}
	fz_always(ctx)
		/* The writer is dropped whether or not writing succeeded. */
		fz_drop_band_writer(ctx, writer);
	fz_catch(ctx)
		fz_rethrow(ctx);
#endif
}
238 | | |
239 | | #ifndef OCR_DISABLED |
/* State for a band writer that emits a PDF with OCR text alongside the page images. */
typedef struct pdfocr_band_writer_s
{
	fz_band_writer super;
	fz_pdfocr_options options;

	/* The actual output size */
	int deskewed_w;
	int deskewed_h;

	int obj_num; /* Object number that the next new_obj() call will hand out. */
	int xref_max; /* Number of entries allocated in xref. */
	int64_t *xref; /* Output file offset of each object, indexed by object number. */
	int pages; /* Number of pages started so far. */
	int page_max; /* Number of entries allocated in page_obj. */
	int *page_obj; /* Value of obj_num at the start of each page. */
	unsigned char *stripbuf; /* Rows of the current strip (w * strip height * n bytes). */
	unsigned char *compbuf; /* Scratch buffer for the deflated strip. */
	size_t complen; /* Size of compbuf, as given by fz_deflate_bound. */

	/* Page contents collected here when skew correction is enabled. */
	fz_pixmap *skew_bitmap;

	void *tessapi; /* Opaque OCR engine handle (presumably Tesseract - confirm). */
	/* One byte per pixel copy of the page, width rounded up to a multiple of 4. */
	fz_pixmap *ocrbitmap;

	/* Optional progress callback and its opaque argument. */
	fz_pdfocr_progress_fn *progress;
	void *progress_arg;
} pdfocr_band_writer;
267 | | |
268 | | static int |
269 | | new_obj(fz_context *ctx, pdfocr_band_writer *writer) |
270 | | { |
271 | | int64_t pos = fz_tell_output(ctx, writer->super.out); |
272 | | |
273 | | if (writer->obj_num >= writer->xref_max) |
274 | | { |
275 | | int new_max = writer->xref_max * 2; |
276 | | if (new_max < writer->obj_num + 8) |
277 | | new_max = writer->obj_num + 8; |
278 | | writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t); |
279 | | writer->xref_max = new_max; |
280 | | } |
281 | | |
282 | | writer->xref[writer->obj_num] = pos; |
283 | | |
284 | | return writer->obj_num++; |
285 | | } |
286 | | |
287 | | static void |
288 | | post_skew_write_header(fz_context *ctx, pdfocr_band_writer *writer, int w, int h) |
289 | | { |
290 | | fz_output *out = writer->super.out; |
291 | | int xres = writer->super.xres; |
292 | | int yres = writer->super.yres; |
293 | | int sh = writer->options.strip_height; |
294 | | int n = writer->super.n; |
295 | | int strips; |
296 | | int i; |
297 | | |
298 | | if (sh == 0) |
299 | | sh = h; |
300 | | assert(sh != 0 && "pdfocr_write_header() should not be given zero height input."); |
301 | | strips = (h + sh-1)/sh; |
302 | | |
303 | | writer->deskewed_w = w; |
304 | | writer->deskewed_h = h; |
305 | | |
306 | | writer->stripbuf = Memento_label(fz_malloc(ctx, (size_t)w * sh * n), "pdfocr_stripbuf"); |
307 | | writer->complen = fz_deflate_bound(ctx, (size_t)w * sh * n); |
308 | | writer->compbuf = Memento_label(fz_malloc(ctx, writer->complen), "pdfocr_compbuf"); |
309 | | |
310 | | /* Always round the width of ocrbitmap up to a multiple of 4. */ |
311 | | writer->ocrbitmap = fz_new_pixmap(ctx, NULL, (w+3)&~3, h, NULL, 0); |
312 | | fz_set_pixmap_resolution(ctx, writer->ocrbitmap, xres, yres); |
313 | | |
314 | | /* Send the Page Object */ |
315 | | fz_write_printf(ctx, out, "%d 0 obj\n<</Type/Page/Parent 2 0 R/Resources<</XObject<<", new_obj(ctx, writer)); |
316 | | for (i = 0; i < strips; i++) |
317 | | fz_write_printf(ctx, out, "/I%d %d 0 R", i, writer->obj_num + i); |
318 | | fz_write_printf(ctx, out, ">>/Font<</F0 3 0 R>>>>/MediaBox[0 0 %g %g]/Contents %d 0 R>>\nendobj\n", |
319 | | w * 72.0f / xres, h * 72.0f / yres, writer->obj_num + strips); |
320 | | } |
321 | | |
322 | | static void |
323 | | pdfocr_write_header(fz_context *ctx, fz_band_writer *writer_, fz_colorspace *cs) |
324 | | { |
325 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
326 | | fz_output *out = writer->super.out; |
327 | | int w = writer->super.w; |
328 | | int h = writer->super.h; |
329 | | int n = writer->super.n; |
330 | | int s = writer->super.s; |
331 | | int a = writer->super.alpha; |
332 | | int sh = writer->options.strip_height; |
333 | | |
334 | | if (sh == 0) |
335 | | sh = h; |
336 | | assert(sh != 0 && "pdfocr_write_header() should not be given zero height input."); |
337 | | |
338 | | if (a != 0) |
339 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR cannot write alpha channel"); |
340 | | if (s != 0) |
341 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR cannot write spot colors"); |
342 | | if (n != 3 && n != 1) |
343 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR expected to be Grayscale or RGB"); |
344 | | |
345 | | fz_free(ctx, writer->stripbuf); |
346 | | writer->stripbuf = NULL; |
347 | | fz_free(ctx, writer->compbuf); |
348 | | writer->compbuf = NULL; |
349 | | fz_drop_pixmap(ctx, writer->ocrbitmap); |
350 | | writer->ocrbitmap = NULL; |
351 | | |
352 | | /* Send the file header on the first page */ |
353 | | if (writer->pages == 0) |
354 | | { |
355 | | fz_write_string(ctx, out, "%PDF-1.4\n%PDFOCR-1.0\n"); |
356 | | |
357 | | if (writer->xref_max < 9) |
358 | | { |
359 | | int new_max = 9; |
360 | | writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t); |
361 | | writer->xref_max = new_max; |
362 | | } |
363 | | writer->xref[3] = fz_tell_output(ctx, out); |
364 | | fz_write_data(ctx, out, funky_font, sizeof(funky_font)-1); |
365 | | writer->xref[4] = fz_tell_output(ctx, out); |
366 | | fz_write_data(ctx, out, funky_font2, sizeof(funky_font2)-1); |
367 | | writer->xref[5] = fz_tell_output(ctx, out); |
368 | | fz_write_data(ctx, out, funky_font3, sizeof(funky_font3)-1); |
369 | | writer->xref[6] = fz_tell_output(ctx, out); |
370 | | fz_write_data(ctx, out, funky_font4, sizeof(funky_font4)-1); |
371 | | writer->xref[7] = fz_tell_output(ctx, out); |
372 | | fz_write_data(ctx, out, funky_font5, sizeof(funky_font5)-1); |
373 | | writer->xref[8] = fz_tell_output(ctx, out); |
374 | | fz_write_data(ctx, out, funky_font6, sizeof(funky_font6)-1); |
375 | | } |
376 | | |
377 | | if (writer->page_max <= writer->pages) |
378 | | { |
379 | | int new_max = writer->page_max * 2; |
380 | | if (new_max == 0) |
381 | | new_max = writer->pages + 8; |
382 | | writer->page_obj = fz_realloc_array(ctx, writer->page_obj, new_max, int); |
383 | | writer->page_max = new_max; |
384 | | } |
385 | | writer->page_obj[writer->pages] = writer->obj_num; |
386 | | writer->pages++; |
387 | | |
388 | | if (writer->options.skew_correct) |
389 | | writer->skew_bitmap = fz_new_pixmap(ctx, n == 3 ? fz_device_rgb(ctx) : fz_device_gray(ctx), w, h, NULL, 0); |
390 | | else |
391 | | post_skew_write_header(ctx, writer, w, h); |
392 | | } |
393 | | |
394 | | static void |
395 | | flush_strip(fz_context *ctx, pdfocr_band_writer *writer, int fill) |
396 | | { |
397 | | unsigned char *data = writer->stripbuf; |
398 | | fz_output *out = writer->super.out; |
399 | | int w = writer->deskewed_w; |
400 | | int n = writer->super.n; |
401 | | size_t len = (size_t)w*n*fill; |
402 | | |
403 | | /* Buffer is full, compress it and write it. */ |
404 | | if (writer->options.compress) |
405 | | { |
406 | | size_t destLen = writer->complen; |
407 | | fz_deflate(ctx, writer->compbuf, &destLen, data, len, FZ_DEFLATE_DEFAULT); |
408 | | len = destLen; |
409 | | data = writer->compbuf; |
410 | | } |
411 | | fz_write_printf(ctx, out, "%d 0 obj\n<</Width %d/ColorSpace/Device%s/Height %d%s/Subtype/Image", |
412 | | new_obj(ctx, writer), w, n == 1 ? "Gray" : "RGB", fill, writer->options.compress ? "/Filter/FlateDecode" : ""); |
413 | | fz_write_printf(ctx, out, "/Length %zd/Type/XObject/BitsPerComponent 8>>\nstream\n", len); |
414 | | fz_write_data(ctx, out, data, len); |
415 | | fz_write_string(ctx, out, "\nendstream\nendobj\n"); |
416 | | } |
417 | | |
418 | | static void |
419 | | post_skew_write_band(fz_context *ctx, pdfocr_band_writer *writer, int stride, int band_start, int band_height, const unsigned char *sp) |
420 | | { |
421 | | int w = writer->deskewed_w; |
422 | | int h = writer->deskewed_h; |
423 | | int n = writer->super.n; |
424 | | int x, y; |
425 | | int sh = writer->options.strip_height; |
426 | | int line; |
427 | | unsigned char *d; |
428 | | |
429 | | if (sh == 0) |
430 | | sh = h; |
431 | | |
432 | | for (line = 0; line < band_height; line++) |
433 | | { |
434 | | int dstline = (band_start+line) % sh; |
435 | | memcpy(writer->stripbuf + (size_t)w*n*dstline, |
436 | | sp + (size_t)line * w * n, |
437 | | (size_t)w * n); |
438 | | if (dstline+1 == sh) |
439 | | flush_strip(ctx, writer, dstline+1); |
440 | | } |
441 | | if (band_start + band_height == h && h % sh != 0) |
442 | | flush_strip(ctx, writer, h % sh); |
443 | | |
444 | | /* Copy strip to ocrbitmap, converting if required. */ |
445 | | d = writer->ocrbitmap->samples; |
446 | | d += band_start*w; |
447 | | if (n == 1) |
448 | | { |
449 | | for (y = band_height; y > 0; y--) |
450 | | { |
451 | | memcpy(d, sp, w); |
452 | | if (writer->ocrbitmap->w - w) |
453 | | memset(d + w, 0, writer->ocrbitmap->w - w); |
454 | | d += writer->ocrbitmap->w; |
455 | | } |
456 | | } |
457 | | else |
458 | | { |
459 | | for (y = band_height; y > 0; y--) |
460 | | { |
461 | | for (x = w; x > 0; x--) |
462 | | { |
463 | | *d++ = (sp[0] + 2*sp[1] + sp[2] + 2)>>2; |
464 | | sp += 3; |
465 | | } |
466 | | for (x = writer->ocrbitmap->w - w; x > 0; x--) |
467 | | *d++ = 0; |
468 | | } |
469 | | } |
470 | | } |
471 | | |
472 | | static void |
473 | | pdfocr_write_band(fz_context *ctx, fz_band_writer *writer_, int stride, int band_start, int band_height, const unsigned char *sp) |
474 | | { |
475 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
476 | | fz_output *out = writer->super.out; |
477 | | int w = writer->super.w; |
478 | | int n = writer->super.n; |
479 | | unsigned char *d; |
480 | | |
481 | | if (!out) |
482 | | return; |
483 | | |
484 | | if (writer->skew_bitmap) |
485 | | { |
486 | | d = writer->skew_bitmap->samples; |
487 | | d += band_start*w*n; |
488 | | memcpy(d, sp, w*n*band_height); |
489 | | } |
490 | | else |
491 | | post_skew_write_band(ctx, writer, stride, band_start, band_height, sp); |
492 | | } |
493 | | |
/* Bit flags recording which directions of character-to-character motion
 * have been seen within a word or line. More than one flag may be set. */
enum
{
	WORD_CONTAINS_L2R = 1, /* left to right */
	WORD_CONTAINS_R2L = 2, /* right to left */
	WORD_CONTAINS_T2B = 4, /* top to bottom */
	WORD_CONTAINS_B2T = 8 /* bottom to top */
};
501 | | |
/* A finished word waiting on the current line. Allocated with
 * fz_malloc_flexible so that chars[] holds exactly len entries. */
typedef struct word_t
{
	struct word_t *next; /* Next word on the line, or NULL. */
	float bbox[4]; /* x0, y0, x1, y1 in points. */
	int dirn; /* Combination of WORD_CONTAINS_* flags. */
	int len; /* Number of entries in chars. */
	int chars[FZ_FLEXIBLE_ARRAY]; /* Unicode values, in the order received. */
} word_t;
510 | | |
/* State shared between the per-character OCR callback and the code that
 * turns the recognised text into page content operators. */
typedef struct
{
	fz_buffer *buf; /* Page content stream being built. */
	pdfocr_band_writer *writer;

	/* We collate the current word into the following fields: */
	int word_max; /* Allocated size of word_chars. */
	int word_len; /* Number of characters collected so far. */
	int *word_chars;
	float word_bbox[4];
	int word_dirn; /* WORD_CONTAINS_* flags seen in this word. */
	int word_prev_char_bbox[4]; /* Bbox of the character that started the word. */

	/* When we finish a word, we try to add it to the line. If the
	 * word fits onto the end of the existing line, great. If not,
	 * we flush the entire line, and start a new one just with the
	 * new word. This enables us to output a whole line at once,
	 * which is beneficial to avoid jittering the font sizes
	 * up/down, which looks bad when we try to select text in the
	 * produced PDF. */
	word_t *line; /* Head of the list of queued words. */
	word_t **line_tail; /* Where the next queued word gets linked in. */
	float line_bbox[4]; /* Union of the bboxes of the queued words. */
	int line_dirn; /* Union of the dirn flags of the queued words. */

	/* Text state most recently written to buf, so that operators are
	 * only emitted when a value changes. */
	float cur_size;
	float cur_scale;
	float tx, ty;
} char_callback_data_t;
540 | | |
/* Write every word queued on the current line to cb->buf as PDF text
 * operators, freeing each word as it is written, then reset the line to
 * empty. Does nothing if no words are queued.
 *
 * For a horizontal line, one font size (the height of the line bbox) is
 * used for all words, and each word gets a horizontal scaling (Tz) chosen
 * from its own width. For a line containing vertical motion, the size is
 * chosen per word and every character is positioned individually. */
static void
flush_words(fz_context *ctx, char_callback_data_t *cb)
{
	float size;

	if (cb->line == NULL)
		return;

	if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0)
	{
		/* Vertical line */
		/* size is chosen per word inside the loop below. */
	}
	else
	{
		/* Horizontal line */
		size = cb->line_bbox[3] - cb->line_bbox[1];

		/* Only emit a font size change when it differs from the last one written. */
		if (size != 0 && size != cb->cur_size)
		{
			fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
			cb->cur_size = size;
		}
		/* Guard against division by 0. This makes no difference to the
		 * actual calculation as if size is 0, word->bbox[2] == word->bbox[0]
		 * too. */
		if (size == 0)
			size = 1;
	}

	while (cb->line)
	{
		word_t *word = cb->line;
		float x, y;
		int i, len = word->len;
		float scale;

		if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0)
		{
			/* Contains vertical text. */
			/* The font size is the height available to each character. */
			size = (word->bbox[3] - word->bbox[1]) / len;
			if (size == 0)
				size = 1;
			if (size != cb->cur_size)
			{
				fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
				cb->cur_size = size;
			}

			/* Set the scale so that our glyphs fill the line bbox. */
			scale = (cb->line_bbox[2] - cb->line_bbox[0]) / size * 200;
			/* A word with a zero scale is dropped without any output. */
			if (scale != 0)
			{
				float letter_height = (word->bbox[3] - word->bbox[1]) / len;

				/* The comparison uses the float value, though the
				 * value written is truncated to an integer. */
				if (scale != cb->cur_scale)
				{
					fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
					cb->cur_scale = scale;
				}

				/* Each character is positioned and shown on its own. */
				for (i = 0; i < len; i++)
				{
					x = word->bbox[0];
					y = word->bbox[1] + letter_height * i;
					/* Td takes an offset relative to the previous position. */
					fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
					cb->tx = x;
					cb->ty = y;

					fz_append_printf(ctx, cb->buf, "<%04x>Tj\n", word->chars[i]);
				}
			}
		}
		else
		{
			/* Horizontal scaling needed for len glyphs to span the word's
			 * width. NOTE(review): the factor of 200 presumably reflects
			 * the font's default glyph width of 500 (half an em) - confirm. */
			scale = (word->bbox[2] - word->bbox[0]) / size / len * 200;
			/* A word with a zero scale is dropped without any output. */
			if (scale != 0)
			{
				if (scale != cb->cur_scale)
				{
					fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
					cb->cur_scale = scale;
				}

				if ((word->dirn & (WORD_CONTAINS_R2L | WORD_CONTAINS_L2R)) == WORD_CONTAINS_R2L)
				{
					/* Purely R2L text */
					x = word->bbox[0];
					y = cb->line_bbox[1];
					fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
					cb->tx = x;
					cb->ty = y;

					/* Tesseract has sent us R2L text in R2L order (i.e. in Logical order).
					 * We want to output it in that same logical order, but PDF operators
					 * all move the point as if outputting L2R. We can either reverse the
					 * order of chars (bad, because of cut/paste) or we can perform
					 * gymnastics with the position. We opt for the latter. */
					fz_append_printf(ctx, cb->buf, "[");
					for (i = 0; i < len; i++)
					{
						if (i == 0)
						{
							/* One initial adjustment before the first
							 * glyph, needed only if there are more glyphs. */
							if (len > 1)
								fz_append_printf(ctx, cb->buf, "%d", -500*(len-1));
						}
						else
							/* A fixed adjustment before each later glyph. */
							fz_append_printf(ctx, cb->buf, "%d", 1000);
						fz_append_printf(ctx, cb->buf, "<%04x>", word->chars[i]);
					}
					fz_append_printf(ctx, cb->buf, "]TJ\n");
				}
				else
				{
					/* L2R (or mixed) text */
					x = word->bbox[0];
					y = cb->line_bbox[1];
					fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
					cb->tx = x;
					cb->ty = y;

					/* The whole word is written as one hex string. */
					fz_append_printf(ctx, cb->buf, "<");
					for (i = 0; i < len; i++)
						fz_append_printf(ctx, cb->buf, "%04x", word->chars[i]);
					fz_append_printf(ctx, cb->buf, ">Tj\n");
				}
			}
		}

		/* Unlink and free the word whether or not it produced output. */
		cb->line = word->next;
		fz_free(ctx, word);
	}

	/* Leave an empty line ready for the next word to be queued. */
	cb->line_tail = &cb->line;
	cb->line = NULL;
	cb->line_dirn = 0;
}
677 | | |
/* Turn the word currently being collated in cb into a word_t and add it
 * to the current line. Does nothing if no characters have been collected.
 *
 * If the word can extend the existing line, horizontally or vertically,
 * the line bbox is grown to include it. Otherwise the existing line is
 * flushed first and the word starts a new line. Either way the collation
 * state (word_len, word_dirn) is reset. */
static void
queue_word(fz_context *ctx, char_callback_data_t *cb)
{
	word_t *word;
	int line_is_v, line_is_h, word_is_v, word_is_h;

	if (cb->word_len == 0)
		return;

	word = fz_malloc_flexible(ctx, word_t, chars, cb->word_len);
	word->next = NULL;
	word->len = cb->word_len;
	memcpy(word->bbox, cb->word_bbox, 4*sizeof(float));
	memcpy(word->chars, cb->word_chars, cb->word_len * sizeof(int));
	cb->word_len = 0;

	/* Classify the line so far, and the new word, by the kinds of
	 * motion each contains. */
	line_is_v = !!(cb->line_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B));
	word_is_v = !!(cb->word_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B));
	line_is_h = !!(cb->line_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L));
	word_is_h = !!(cb->word_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L));

	word->dirn = cb->word_dirn;
	cb->word_dirn = 0;

	/* Can we put the new word onto the end of the existing line? */
	/* Horizontal case: neither contains vertical motion, the vertical
	 * extents overlap, and the word lies wholly to one side of the line. */
	if (cb->line != NULL &&
		!line_is_v && !word_is_v &&
		word->bbox[1] <= cb->line_bbox[3] &&
		word->bbox[3] >= cb->line_bbox[1] &&
		(word->bbox[0] >= cb->line_bbox[2] || word->bbox[2] <= cb->line_bbox[0]))
	{
		/* Can append (horizontal motion). */
		if (word->bbox[0] < cb->line_bbox[0])
			cb->line_bbox[0] = word->bbox[0];
		if (word->bbox[1] < cb->line_bbox[1])
			cb->line_bbox[1] = word->bbox[1];
		if (word->bbox[2] > cb->line_bbox[2])
			cb->line_bbox[2] = word->bbox[2];
		if (word->bbox[3] > cb->line_bbox[3])
			cb->line_bbox[3] = word->bbox[3];
	}
	/* Vertical case: neither contains horizontal motion, the horizontal
	 * extents overlap, and the word lies wholly above or below the line. */
	else if (cb->line != NULL &&
		!line_is_h && !word_is_h &&
		word->bbox[0] <= cb->line_bbox[2] &&
		word->bbox[2] >= cb->line_bbox[0] &&
		(word->bbox[1] >= cb->line_bbox[3] || word->bbox[3] <= cb->line_bbox[1]))
	{
		/* Can append (vertical motion). */
		/* A word with no direction of its own is marked vertical, so
		 * that the line as a whole is written as vertical text. */
		if (!word_is_v)
			word->dirn |= WORD_CONTAINS_T2B;
		if (word->bbox[0] < cb->line_bbox[0])
			cb->line_bbox[0] = word->bbox[0];
		if (word->bbox[1] < cb->line_bbox[1])
			cb->line_bbox[1] = word->bbox[1];
		if (word->bbox[2] > cb->line_bbox[2])
			cb->line_bbox[2] = word->bbox[2];
		if (word->bbox[3] > cb->line_bbox[3])
			cb->line_bbox[3] = word->bbox[3];
	}
	else
	{
		/* The word does not fit: write out the existing line (if any)
		 * and begin a new one. The word is not yet on any list, so it
		 * must be freed here if the flush fails. */
		fz_try(ctx)
			flush_words(ctx, cb);
		fz_catch(ctx)
		{
			fz_free(ctx, word);
			fz_rethrow(ctx);
		}
		memcpy(cb->line_bbox, word->bbox, 4*sizeof(float));
	}

	/* Link the word onto the end of the line. */
	*cb->line_tail = word;
	cb->line_tail = &word->next;
	cb->line_dirn |= word->dirn;
}
753 | | |
754 | | static void |
755 | | char_callback(fz_context *ctx, void *arg, int unicode, |
756 | | const char *font_name, |
757 | | const int *line_bbox, const int *word_bbox, |
758 | | const int *char_bbox, int pointsize) |
759 | | { |
760 | | char_callback_data_t *cb = (char_callback_data_t *)arg; |
761 | | pdfocr_band_writer *writer = cb->writer; |
762 | | float bbox[4]; |
763 | | |
764 | | bbox[0] = word_bbox[0] * 72.0f / cb->writer->ocrbitmap->xres; |
765 | | bbox[3] = (writer->ocrbitmap->h - 1 - word_bbox[1]) * 72.0f / cb->writer->ocrbitmap->yres; |
766 | | bbox[2] = word_bbox[2] * 72.0f / cb->writer->ocrbitmap->yres; |
767 | | bbox[1] = (writer->ocrbitmap->h - 1 - word_bbox[3]) * 72.0f / cb->writer->ocrbitmap->yres; |
768 | | |
769 | | if (bbox[0] != cb->word_bbox[0] || |
770 | | bbox[1] != cb->word_bbox[1] || |
771 | | bbox[2] != cb->word_bbox[2] || |
772 | | bbox[3] != cb->word_bbox[3]) |
773 | | { |
774 | | queue_word(ctx, cb); |
775 | | memcpy(cb->word_bbox, bbox, 4 * sizeof(float)); |
776 | | } |
777 | | |
778 | | if (cb->word_len == 0) |
779 | | { |
780 | | cb->word_dirn = 0; |
781 | | memcpy(cb->word_prev_char_bbox, char_bbox, 4 * sizeof(int)); |
782 | | } |
783 | | else |
784 | | { |
785 | | int ox = cb->word_prev_char_bbox[0] + cb->word_prev_char_bbox[2]; |
786 | | int oy = cb->word_prev_char_bbox[1] + cb->word_prev_char_bbox[3]; |
787 | | int x = char_bbox[0] + char_bbox[2] - ox; |
788 | | int y = char_bbox[1] + char_bbox[3] - oy; |
789 | | int ax = x < 0 ? -x : x; |
790 | | int ay = y < 0 ? -y : y; |
791 | | if (ax > ay) |
792 | | { |
793 | | if (x > 0) |
794 | | cb->word_dirn |= WORD_CONTAINS_L2R; |
795 | | else if (x < 0) |
796 | | cb->word_dirn |= WORD_CONTAINS_R2L; |
797 | | } |
798 | | else if (ay < ax) |
799 | | { |
800 | | if (y > 0) |
801 | | cb->word_dirn |= WORD_CONTAINS_T2B; |
802 | | else if (y < 0) |
803 | | cb->word_dirn |= WORD_CONTAINS_B2T; |
804 | | } |
805 | | } |
806 | | |
807 | | if (cb->word_max == cb->word_len) |
808 | | { |
809 | | int newmax = cb->word_max * 2; |
810 | | if (newmax == 0) |
811 | | newmax = 16; |
812 | | cb->word_chars = fz_realloc_array(ctx, cb->word_chars, newmax, int); |
813 | | cb->word_max = newmax; |
814 | | } |
815 | | |
816 | | cb->word_chars[cb->word_len++] = unicode; |
817 | | } |
818 | | |
819 | | static int |
820 | | pdfocr_progress(fz_context *ctx, void *arg, int prog) |
821 | | { |
822 | | char_callback_data_t *cb = (char_callback_data_t *)arg; |
823 | | pdfocr_band_writer *writer = cb->writer; |
824 | | |
825 | | if (writer->progress == NULL) |
826 | | return 0; |
827 | | |
828 | | return writer->progress(ctx, writer->progress_arg, writer->pages - 1, prog); |
829 | | } |
830 | | |
/* Deskew the page collected in writer->skew_bitmap, then write the result
 * as though it had been supplied directly: first the page header for the
 * deskewed size, then the whole deskewed image as a single band. */
static void
do_skew_correct(fz_context *ctx, pdfocr_band_writer *writer)
{
	fz_pixmap *deskewed;

	/* skew_correct == 1 means the angle is detected from the page
	 * itself. The detected angle is stored back into the options. */
	if (writer->options.skew_correct == 1)
		writer->options.skew_angle = fz_detect_skew(ctx, writer->skew_bitmap);

	deskewed = fz_deskew_pixmap(ctx, writer->skew_bitmap, writer->options.skew_angle, writer->options.skew_border);

	fz_try(ctx)
	{
		post_skew_write_header(ctx, writer, deskewed->w, deskewed->h);
		post_skew_write_band(ctx, writer, deskewed->stride, 0, deskewed->h, deskewed->samples);
	}
	fz_always(ctx)
		/* The deskewed copy is only needed while it is being written.
		 * Note that skew_bitmap itself is not released here. */
		fz_drop_pixmap(ctx, deskewed);
	fz_catch(ctx)
		fz_rethrow(ctx);
}
851 | | |
852 | | static void |
853 | | pdfocr_write_trailer(fz_context *ctx, fz_band_writer *writer_) |
854 | | { |
855 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
856 | | fz_output *out = writer->super.out; |
857 | | int xres = writer->super.xres; |
858 | | int yres = writer->super.yres; |
859 | | int sh = writer->options.strip_height; |
860 | | int strips; |
861 | | int w, h, i; |
862 | | size_t len; |
863 | | unsigned char *data; |
864 | | fz_buffer *buf = NULL; |
865 | | char_callback_data_t cb = { NULL }; |
866 | | |
867 | | if (writer->options.skew_correct) |
868 | | do_skew_correct(ctx, writer); |
869 | | |
870 | | w = writer->deskewed_w; |
871 | | h = writer->deskewed_h; |
872 | | if (sh == 0) |
873 | | sh = h; |
874 | | strips = (h + sh-1)/sh; |
875 | | |
876 | | /* Send the Page contents */ |
877 | | /* We need the length to this, so write to a buffer first */ |
878 | | fz_var(buf); |
879 | | fz_var(cb); |
880 | | fz_try(ctx) |
881 | | { |
882 | | cb.writer = writer; |
883 | | cb.buf = buf = fz_new_buffer(ctx, 0); |
884 | | cb.line_tail = &cb.line; |
885 | | cb.word_dirn = 0; |
886 | | cb.line_dirn = 0; |
887 | | fz_append_printf(ctx, buf, "q\n%g 0 0 %g 0 0 cm\n", 72.0f/xres, 72.0f/yres); |
888 | | for (i = 0; i < strips; i++) |
889 | | { |
890 | | int at = h - (i+1)*sh; |
891 | | int this_sh = sh; |
892 | | if (at < 0) |
893 | | { |
894 | | this_sh += at; |
895 | | at = 0; |
896 | | } |
897 | | fz_append_printf(ctx, buf, "/P <</MCID 0>> BDC\nq\n%d 0 0 %d 0 %d cm\n/I%d Do\nQ\n", |
898 | | w, this_sh, at, i); |
899 | | } |
900 | | |
901 | | fz_append_printf(ctx, buf, "Q\nBT\n3 Tr\n"); |
902 | | |
903 | | ocr_recognise(ctx, writer->tessapi, writer->ocrbitmap, char_callback, pdfocr_progress, &cb); |
904 | | queue_word(ctx, &cb); |
905 | | flush_words(ctx, &cb); |
906 | | fz_append_printf(ctx, buf, "ET\n"); |
907 | | |
908 | | len = fz_buffer_storage(ctx, buf, &data); |
909 | | fz_write_printf(ctx, out, "%d 0 obj\n<</Length %zd>>\nstream\n", new_obj(ctx, writer), len); |
910 | | fz_write_data(ctx, out, data, len); |
911 | | fz_drop_buffer(ctx, buf); |
912 | | buf = NULL; |
913 | | fz_write_string(ctx, out, "\nendstream\nendobj\n"); |
914 | | } |
915 | | fz_always(ctx) |
916 | | { |
917 | | fz_free(ctx, cb.word_chars); |
918 | | } |
919 | | fz_catch(ctx) |
920 | | { |
921 | | fz_drop_buffer(ctx, buf); |
922 | | fz_rethrow(ctx); |
923 | | } |
924 | | } |
925 | | |
926 | | static void |
927 | | pdfocr_close_band_writer(fz_context *ctx, fz_band_writer *writer_) |
928 | | { |
929 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
930 | | fz_output *out = writer->super.out; |
931 | | int i; |
932 | | |
933 | | /* We actually do the trailer writing in the close */ |
934 | | if (writer->xref_max > 2) |
935 | | { |
936 | | int64_t t_pos; |
937 | | |
938 | | /* Catalog */ |
939 | | writer->xref[1] = fz_tell_output(ctx, out); |
940 | | fz_write_printf(ctx, out, "1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n"); |
941 | | |
942 | | /* Page table */ |
943 | | writer->xref[2] = fz_tell_output(ctx, out); |
944 | | fz_write_printf(ctx, out, "2 0 obj\n<</Count %d/Kids[", writer->pages); |
945 | | |
946 | | for (i = 0; i < writer->pages; i++) |
947 | | { |
948 | | if (i > 0) |
949 | | fz_write_byte(ctx, out, ' '); |
950 | | fz_write_printf(ctx, out, "%d 0 R", writer->page_obj[i]); |
951 | | } |
952 | | fz_write_string(ctx, out, "]/Type/Pages>>\nendobj\n"); |
953 | | |
954 | | /* Xref */ |
955 | | t_pos = fz_tell_output(ctx, out); |
956 | | fz_write_printf(ctx, out, "xref\n0 %d\n0000000000 65535 f \n", writer->obj_num); |
957 | | for (i = 1; i < writer->obj_num; i++) |
958 | | fz_write_printf(ctx, out, "%010ld 00000 n \n", writer->xref[i]); |
959 | | fz_write_printf(ctx, out, "trailer\n<</Size %d/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n", writer->obj_num, t_pos); |
960 | | } |
961 | | } |
962 | | |
963 | | static void |
964 | | pdfocr_drop_band_writer(fz_context *ctx, fz_band_writer *writer_) |
965 | | { |
966 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
967 | | |
968 | | fz_free(ctx, writer->options.options); |
969 | | fz_free(ctx, writer->stripbuf); |
970 | | fz_free(ctx, writer->compbuf); |
971 | | fz_free(ctx, writer->page_obj); |
972 | | fz_free(ctx, writer->xref); |
973 | | fz_drop_pixmap(ctx, writer->ocrbitmap); |
974 | | ocr_fin(ctx, writer->tessapi); |
975 | | } |
976 | | #endif |
977 | | |
978 | | fz_band_writer *fz_new_pdfocr_band_writer(fz_context *ctx, fz_output *out, const fz_pdfocr_options *options) |
979 | 0 | { |
980 | 0 | #ifdef OCR_DISABLED |
981 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
982 | | #else |
983 | | pdfocr_band_writer *writer = fz_new_band_writer(ctx, pdfocr_band_writer, out); |
984 | | |
985 | | writer->super.header = pdfocr_write_header; |
986 | | writer->super.band = pdfocr_write_band; |
987 | | writer->super.trailer = pdfocr_write_trailer; |
988 | | writer->super.close = pdfocr_close_band_writer; |
989 | | writer->super.drop = pdfocr_drop_band_writer; |
990 | | |
991 | | if (options) |
992 | | { |
993 | | writer->options = *options; |
994 | | writer->options.options = NULL; |
995 | | } |
996 | | else |
997 | | memset(&writer->options, 0, sizeof(writer->options)); |
998 | | |
999 | | /* Objects: |
1000 | | * 1 reserved for catalog |
1001 | | * 2 for pages tree |
1002 | | * 3 font |
1003 | | * 4 cidfont |
1004 | | * 5 cid to gid map |
1005 | | * 6 tounicode |
1006 | | * 7 font descriptor |
1007 | | * 8 font file |
1008 | | */ |
1009 | | writer->obj_num = 9; |
1010 | | |
1011 | | fz_try(ctx) |
1012 | | { |
1013 | | if (options && options->options) |
1014 | | writer->options.options = fz_strdup(ctx, options->options); |
1015 | | writer->tessapi = ocr_init(ctx, writer->options.language, writer->options.datadir, writer->options.options); |
1016 | | } |
1017 | | fz_catch(ctx) |
1018 | | { |
1019 | | fz_drop_band_writer(ctx, &writer->super); |
1020 | | fz_rethrow(ctx); |
1021 | | } |
1022 | | |
1023 | | return &writer->super; |
1024 | | #endif |
1025 | 0 | } |
1026 | | |
1027 | | void |
1028 | | fz_pdfocr_band_writer_set_progress(fz_context *ctx, fz_band_writer *writer_, fz_pdfocr_progress_fn *progress, void *progress_arg) |
1029 | 0 | { |
1030 | 0 | #ifdef OCR_DISABLED |
1031 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
1032 | | #else |
1033 | | pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_; |
1034 | | if (writer == NULL) |
1035 | | return; |
1036 | | if (writer->super.header != pdfocr_write_header) |
1037 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "Not a pdfocr band writer!"); |
1038 | | |
1039 | | writer->progress = progress; |
1040 | | writer->progress_arg = progress_arg; |
1041 | | #endif |
1042 | 0 | } |
1043 | | |
1044 | | void |
1045 | | fz_save_pixmap_as_pdfocr(fz_context *ctx, fz_pixmap *pixmap, char *filename, int append, const fz_pdfocr_options *pdfocr) |
1046 | 0 | { |
1047 | 0 | #ifdef OCR_DISABLED |
1048 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
1049 | | #else |
1050 | | fz_output *out = fz_new_output_with_path(ctx, filename, append); |
1051 | | fz_try(ctx) |
1052 | | { |
1053 | | fz_write_pixmap_as_pdfocr(ctx, out, pixmap, pdfocr); |
1054 | | fz_close_output(ctx, out); |
1055 | | } |
1056 | | fz_always(ctx) |
1057 | | fz_drop_output(ctx, out); |
1058 | | fz_catch(ctx) |
1059 | | fz_rethrow(ctx); |
1060 | | #endif |
1061 | 0 | } |
1062 | | |
1063 | | /* High-level document writer interface */ |
1064 | | |
1065 | | #ifndef OCR_DISABLED |
/* State for the pdfocr document writer. */
typedef struct
{
	fz_document_writer super; /* Base document writer. */
	fz_draw_options draw; /* Filled in by fz_parse_draw_options from the option string. */
	fz_pdfocr_options pdfocr; /* Filled in by fz_parse_pdfocr_options from the option string. */
	fz_pixmap *pixmap; /* Set by the draw device in begin_page; dropped and cleared in end_page. */
	fz_band_writer *bander; /* pdfocr band writer that receives each page. */
	fz_output *out; /* Output being written to; dropped in pdfocr_drop_writer. */
	int pagenum; /* Passed to fz_write_header and incremented once per page. */
} fz_pdfocr_writer;
1076 | | |
1077 | | static fz_device * |
1078 | | pdfocr_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox) |
1079 | | { |
1080 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
1081 | | return fz_new_draw_device_with_options(ctx, &wri->draw, mediabox, &wri->pixmap); |
1082 | | } |
1083 | | |
1084 | | static void |
1085 | | pdfocr_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev) |
1086 | | { |
1087 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
1088 | | fz_pixmap *pix = wri->pixmap; |
1089 | | |
1090 | | fz_try(ctx) |
1091 | | { |
1092 | | fz_close_device(ctx, dev); |
1093 | | fz_write_header(ctx, wri->bander, pix->w, pix->h, pix->n, pix->alpha, pix->xres, pix->yres, wri->pagenum++, pix->colorspace, pix->seps); |
1094 | | fz_write_band(ctx, wri->bander, pix->stride, pix->h, pix->samples); |
1095 | | } |
1096 | | fz_always(ctx) |
1097 | | { |
1098 | | fz_drop_device(ctx, dev); |
1099 | | fz_drop_pixmap(ctx, pix); |
1100 | | wri->pixmap = NULL; |
1101 | | } |
1102 | | fz_catch(ctx) |
1103 | | fz_rethrow(ctx); |
1104 | | } |
1105 | | |
1106 | | static void |
1107 | | pdfocr_close_writer(fz_context *ctx, fz_document_writer *wri_) |
1108 | | { |
1109 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
1110 | | |
1111 | | fz_close_band_writer(ctx, wri->bander); |
1112 | | fz_close_output(ctx, wri->out); |
1113 | | } |
1114 | | |
1115 | | static void |
1116 | | pdfocr_drop_writer(fz_context *ctx, fz_document_writer *wri_) |
1117 | | { |
1118 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_; |
1119 | | |
1120 | | fz_drop_pixmap(ctx, wri->pixmap); |
1121 | | fz_drop_band_writer(ctx, wri->bander); |
1122 | | fz_drop_output(ctx, wri->out); |
1123 | | } |
1124 | | #endif |
1125 | | |
1126 | | fz_document_writer * |
1127 | | fz_new_pdfocr_writer_with_output(fz_context *ctx, fz_output *out, const char *options) |
1128 | 0 | { |
1129 | 0 | #ifdef OCR_DISABLED |
1130 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
1131 | | #else |
1132 | | fz_pdfocr_writer *wri = NULL; |
1133 | | |
1134 | | fz_var(wri); |
1135 | | |
1136 | | fz_try(ctx) |
1137 | | { |
1138 | | wri = fz_new_derived_document_writer(ctx, fz_pdfocr_writer, pdfocr_begin_page, pdfocr_end_page, pdfocr_close_writer, pdfocr_drop_writer); |
1139 | | fz_parse_draw_options(ctx, &wri->draw, options); |
1140 | | fz_parse_pdfocr_options(ctx, &wri->pdfocr, options); |
1141 | | wri->out = out; |
1142 | | wri->bander = fz_new_pdfocr_band_writer(ctx, wri->out, &wri->pdfocr); |
1143 | | } |
1144 | | fz_catch(ctx) |
1145 | | { |
1146 | | fz_drop_output(ctx, out); |
1147 | | fz_free(ctx, wri); |
1148 | | fz_rethrow(ctx); |
1149 | | } |
1150 | | |
1151 | | return (fz_document_writer*)wri; |
1152 | | #endif |
1153 | 0 | } |
1154 | | |
1155 | | fz_document_writer * |
1156 | | fz_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options) |
1157 | 0 | { |
1158 | 0 | #ifdef OCR_DISABLED |
1159 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
1160 | | #else |
1161 | | fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.pdfocr", 0); |
1162 | | return fz_new_pdfocr_writer_with_output(ctx, out, options); |
1163 | | #endif |
1164 | 0 | } |
1165 | | |
1166 | | void |
1167 | | fz_pdfocr_writer_set_progress(fz_context *ctx, fz_document_writer *writer, fz_pdfocr_progress_fn *progress, void *progress_arg) |
1168 | 0 | { |
1169 | 0 | #ifdef OCR_DISABLED |
1170 | 0 | fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build"); |
1171 | | #else |
1172 | | fz_pdfocr_writer *wri = (fz_pdfocr_writer *)writer; |
1173 | | if (!writer) |
1174 | | return; |
1175 | | if (writer->begin_page != pdfocr_begin_page) |
1176 | | fz_throw(ctx, FZ_ERROR_ARGUMENT, "Not a pdfocr writer!"); |
1177 | | fz_pdfocr_band_writer_set_progress(ctx, wri->bander, progress, progress_arg); |
1178 | | #endif |
1179 | 0 | } |