Coverage Report

Created: 2026-06-30 07:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/fitz/output-pdfocr.c
Line
Count
Source
1
// Copyright (C) 2004-2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#include <assert.h>
26
#include <string.h>
27
#include <limits.h>
28
29
#ifdef OCR_DISABLED
30
31
/* In non-OCR builds, we need to define this otherwise SWIG Python gets SEGV
32
when it attempts to import mupdf.py and _mupdf.py. */
33
const char *fz_pdfocr_write_options_usage = "";
34
35
#else
36
37
#include "tessocr.h"
38
39
const char *fz_pdfocr_write_options_usage =
40
  "PDFOCR output options:\n"
41
  "\tcompression=none: No compression (default)\n"
42
  "\tcompression=flate: Flate compression\n"
43
  "\tstrip-height=N: Strip height (default 0=fullpage)\n"
44
  "\tocr-language=<lang>: OCR language (default=eng)\n"
45
  "\tocr-datadir=<datadir>: OCR data path (default=rely on TESSDATA_PREFIX)\n"
46
  "\tskew=none,auto,<angle>: Whether to skew correct (default=none).\n"
47
  "\tskew-border=increase,maintain,decrease: Size change for border pixels (default=increase).\n"
48
  "\n";
49
50
static const char funky_font[] =
51
"3 0 obj\n<</BaseFont/GlyphLessFont/DescendantFonts[4 0 R]"
52
"/Encoding/Identity-H/Subtype/Type0/ToUnicode 6 0 R/Type/Font"
53
">>\nendobj\n";
54
55
static const char funky_font2[] =
56
"4 0 obj\n"
57
"<</BaseFont/GlyphLessFont/CIDToGIDMap 5 0 R"
58
"/CIDSystemInfo<</Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
59
"/FontDescriptor 7 0 R/Subtype/CIDFontType2/Type/Font/DW 500>>"
60
"\nendobj\n";
61
62
static const char funky_font3[] =
63
"5 0 obj\n<</Length 210/Filter/FlateDecode>>\nstream\n"
64
"\x78\x9c\xec\xc2\x01\x09\x00\x00\x00\x02\xa0\xfa\x7f\xba\x21\x89"
65
"\xa6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
66
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
67
"\x80\x7b\x03\x00\x00\xff\xff\xec\xc2\x01\x0d\x00\x00\x00\xc2\x20"
68
"\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
69
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
70
"\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xec\xc2\x01\x0d\x00"
71
"\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00"
72
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
73
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xed"
74
"\xc2\x01\x0d\x00\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00"
75
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
76
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\xff"
77
"\x00\x10"
78
"\nendstream\nendobj\n";
79
80
static const char funky_font4[] =
81
"6 0 obj\n<</Length 353>>\nstream\n"
82
"/CIDInit /ProcSet findresource begin\n"
83
"12 dict begin\n"
84
"begincmap\n"
85
"/CIDSystemInfo\n"
86
"<<\n"
87
"  /Registry (Adobe)\n"
88
"  /Ordering (UCS)\n"
89
"  /Supplement 0\n"
90
">> def\n"
91
"/CMapName /Adobe-Identity-UCS def\n"
92
"/CMapType 2 def\n"
93
"1 begincodespacerange\n"
94
"<0000> <FFFF>\n"
95
"endcodespacerange\n"
96
"1 beginbfrange\n"
97
"<0000> <FFFF> <0000>\n"
98
"endbfrange\n"
99
"endcmap\n"
100
"CMapName currentdict /CMap defineresource pop\n"
101
"end\n"
102
"end\n"
103
"endstream\n"
104
"endobj\n";
105
106
static const char funky_font5[] =
107
"7 0 obj\n"
108
"<</Ascent 1000/CapHeight 1000/Descent -1/Flags 5"
109
"/FontBBox[0 0 500 1000]/FontFile2 8 0 R/FontName/GlyphLessFont"
110
"/ItalicAngle 0/StemV 80/Type/FontDescriptor>>\nendobj\n";
111
112
static const char funky_font6[] =
113
"8 0 obj\n<</Length 572/Length1 572>>\nstream\n"
114
"\x00\x01\x00\x00\x00\x0a\x00\x80\x00\x03\x00\x20\x4f\x53\x2f\x32"
115
"\x56\xde\xc8\x94\x00\x00\x01\x28\x00\x00\x00\x60\x63\x6d\x61\x70"
116
"\x00\x0a\x00\x34\x00\x00\x01\x90\x00\x00\x00\x1e\x67\x6c\x79\x66"
117
"\x15\x22\x41\x24\x00\x00\x01\xb8\x00\x00\x00\x18\x68\x65\x61\x64"
118
"\x0b\x78\xf1\x65\x00\x00\x00\xac\x00\x00\x00\x36\x68\x68\x65\x61"
119
"\x0c\x02\x04\x02\x00\x00\x00\xe4\x00\x00\x00\x24\x68\x6d\x74\x78"
120
"\x04\x00\x00\x00\x00\x00\x01\x88\x00\x00\x00\x08\x6c\x6f\x63\x61"
121
"\x00\x0c\x00\x00\x00\x00\x01\xb0\x00\x00\x00\x06\x6d\x61\x78\x70"
122
"\x00\x04\x00\x05\x00\x00\x01\x08\x00\x00\x00\x20\x6e\x61\x6d\x65"
123
"\xf2\xeb\x16\xda\x00\x00\x01\xd0\x00\x00\x00\x4b\x70\x6f\x73\x74"
124
"\x00\x01\x00\x01\x00\x00\x02\x1c\x00\x00\x00\x20\x00\x01\x00\x00"
125
"\x00\x01\x00\x00\xb0\x94\x71\x10\x5f\x0f\x3c\xf5\x04\x07\x08\x00"
126
"\x00\x00\x00\x00\xcf\x9a\xfc\x6e\x00\x00\x00\x00\xd4\xc3\xa7\xf2"
127
"\x00\x00\x00\x00\x04\x00\x08\x00\x00\x00\x00\x10\x00\x02\x00\x00"
128
"\x00\x00\x00\x00\x00\x01\x00\x00\x08\x00\xff\xff\x00\x00\x04\x00"
129
"\x00\x00\x00\x00\x04\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
130
"\x00\x00\x00\x00\x00\x00\x00\x02\x00\x01\x00\x00\x00\x02\x00\x04"
131
"\x00\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
132
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x01\x90\x00\x05"
133
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
134
"\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x01\x00\x01\x00\x00\x00"
135
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
136
"\x00\x00\x47\x4f\x4f\x47\x00\x40\x00\x00\x00\x00\x00\x01\xff\xff"
137
"\x00\x00\x00\x01\x00\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00"
138
"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00"
139
"\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00\x14\x00\x03\x00\x00"
140
"\x00\x00\x00\x14\x00\x06\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00"
141
"\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x00\x04\x00"
142
"\x08\x00\x00\x03\x00\x00\x31\x21\x11\x21\x04\x00\xfc\x00\x08\x00"
143
"\x00\x00\x00\x03\x00\x2a\x00\x00\x00\x03\x00\x00\x00\x05\x00\x16"
144
"\x00\x00\x00\x01\x00\x00\x00\x00\x00\x05\x00\x0b\x00\x16\x00\x03"
145
"\x00\x01\x04\x09\x00\x05\x00\x16\x00\x00\x00\x56\x00\x65\x00\x72"
146
"\x00\x73\x00\x69\x00\x6f\x00\x6e\x00\x20\x00\x31\x00\x2e\x00\x30"
147
"\x56\x65\x72\x73\x69\x6f\x6e\x20\x31\x2e\x30\x00\x00\x01\x00\x00"
148
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00"
149
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
150
"\nendstream\nendobj\n";
151
152
#endif
153
154
void
155
fz_init_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts)
156
0
{
157
0
  memset(opts, 0, sizeof *opts);
158
0
}
159
160
fz_pdfocr_options *
161
fz_parse_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, const char *args)
162
0
{
163
0
#ifdef OCR_DISABLED
164
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
165
#else
166
  fz_options *options = fz_new_options(ctx, args);
167
  fz_try(ctx)
168
  {
169
    fz_init_pdfocr_options(ctx, opts);
170
    fz_apply_pdfocr_options(ctx, opts, options);
171
    fz_throw_on_unused_options(ctx, options, "pdfocr");
172
  }
173
  fz_always(ctx)
174
    fz_drop_options(ctx, options);
175
  fz_catch(ctx)
176
    fz_rethrow(ctx);
177
#endif
178
0
  return opts;
179
0
}
180
181
void
182
fz_apply_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, fz_options *args)
183
0
{
184
0
#ifdef OCR_DISABLED
185
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
186
#else
187
  const char *val;
188
189
  if (fz_lookup_option(ctx, args, "compression", &val))
190
  {
191
    if (!strcmp(val, "none"))
192
      opts->compress = 0;
193
    else if (!strcmp(val, "flate"))
194
      opts->compress = 1;
195
    else
196
      fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported PDFOCR compression %s (none, or flate only)", val);
197
  }
198
  if (fz_lookup_option(ctx, args, "strip-height", &val))
199
  {
200
    int i = fz_atoi(val);
201
    if (i <= 0)
202
      fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported PDFOCR strip height %d (suggest 0)", i);
203
    opts->strip_height = i;
204
  }
205
  if (fz_lookup_option(ctx, args, "ocr-language", &val))
206
  {
207
    fz_strlcpy(opts->language, val, nelem(opts->language));
208
  }
209
  if (fz_lookup_option(ctx, args, "ocr-datadir", &val))
210
  {
211
    fz_strlcpy(opts->datadir, val, nelem(opts->datadir));
212
  }
213
  if (fz_lookup_option(ctx, args, "skew", &val))
214
  {
215
    if (!strcmp(val, "auto"))
216
      opts->skew_correct = 1;
217
    else
218
    {
219
      opts->skew_correct = 2;
220
      opts->skew_angle = fz_atof(val);
221
    }
222
  }
223
  if (fz_lookup_option(ctx, args, "skew-border", &val))
224
  {
225
    if (!strcmp(val, "increase"))
226
      opts->skew_border = 0;
227
    else if (!strcmp(val, "maintain"))
228
      opts->skew_border = 1;
229
    else if (!strcmp(val, "decrease"))
230
      opts->skew_border = 2;
231
    else
232
      fz_throw(ctx, FZ_ERROR_ARGUMENT, "Unsupported skew-border option");
233
  }
234
235
  fz_validate_options(ctx, args, "pdfocr");
236
#endif
237
0
}
238
239
void
240
fz_write_pixmap_as_pdfocr(fz_context *ctx, fz_output *out, const fz_pixmap *pixmap, const fz_pdfocr_options *pdfocr)
241
0
{
242
0
#ifdef OCR_DISABLED
243
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
244
#else
245
  fz_band_writer *writer;
246
247
  if (!pixmap || !out)
248
    return;
249
250
  writer = fz_new_pdfocr_band_writer(ctx, out, pdfocr);
251
  fz_try(ctx)
252
  {
253
    fz_write_header(ctx, writer, pixmap->w, pixmap->h, pixmap->n, pixmap->alpha, pixmap->xres, pixmap->yres, 0, pixmap->colorspace, pixmap->seps);
254
    fz_write_band(ctx, writer, pixmap->stride, pixmap->h, pixmap->samples);
255
    fz_close_band_writer(ctx, writer);
256
  }
257
  fz_always(ctx)
258
    fz_drop_band_writer(ctx, writer);
259
  fz_catch(ctx)
260
    fz_rethrow(ctx);
261
#endif
262
0
}
263
264
#ifndef OCR_DISABLED
265
typedef struct pdfocr_band_writer_s
266
{
267
  fz_band_writer super;
268
  fz_pdfocr_options options;
269
270
  /* The actual output size */
271
  int deskewed_w;
272
  int deskewed_h;
273
274
  int obj_num;
275
  int xref_max;
276
  int64_t *xref;
277
  int pages;
278
  int page_max;
279
  int *page_obj;
280
  unsigned char *stripbuf;
281
  unsigned char *compbuf;
282
  size_t complen;
283
284
  fz_pixmap *skew_bitmap;
285
286
  void *tessapi;
287
  fz_pixmap *ocrbitmap;
288
289
  fz_pdfocr_progress_fn *progress;
290
  void *progress_arg;
291
} pdfocr_band_writer;
292
293
static int
294
new_obj(fz_context *ctx, pdfocr_band_writer *writer)
295
{
296
  int64_t pos = fz_tell_output(ctx, writer->super.out);
297
298
  if (writer->obj_num >= writer->xref_max)
299
  {
300
    int new_max = writer->xref_max * 2;
301
    if (new_max < writer->obj_num + 8)
302
      new_max = writer->obj_num + 8;
303
    writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t);
304
    writer->xref_max = new_max;
305
  }
306
307
  writer->xref[writer->obj_num] = pos;
308
309
  return writer->obj_num++;
310
}
311
312
static void
313
post_skew_write_header(fz_context *ctx, pdfocr_band_writer *writer, int w, int h)
314
{
315
  fz_output *out = writer->super.out;
316
  int xres = writer->super.xres;
317
  int yres = writer->super.yres;
318
  int sh = writer->options.strip_height;
319
  int n = writer->super.n;
320
  int strips;
321
  int i;
322
323
  if (sh == 0)
324
    sh = h;
325
  assert(sh != 0 && "pdfocr_write_header() should not be given zero height input.");
326
  strips = (h + sh-1)/sh;
327
328
  writer->deskewed_w = w;
329
  writer->deskewed_h = h;
330
331
  writer->stripbuf = Memento_label(fz_malloc(ctx, (size_t)w * sh * n), "pdfocr_stripbuf");
332
  writer->complen = fz_deflate_bound(ctx, (size_t)w * sh * n);
333
  writer->compbuf = Memento_label(fz_malloc(ctx, writer->complen), "pdfocr_compbuf");
334
335
  /* Always round the width of ocrbitmap up to a multiple of 4. */
336
  writer->ocrbitmap = fz_new_pixmap(ctx, NULL, (w+3)&~3, h, NULL, 0);
337
  fz_set_pixmap_resolution(ctx, writer->ocrbitmap, xres, yres);
338
339
  /* Send the Page Object */
340
  fz_write_printf(ctx, out, "%d 0 obj\n<</Type/Page/Parent 2 0 R/Resources<</XObject<<", new_obj(ctx, writer));
341
  for (i = 0; i < strips; i++)
342
    fz_write_printf(ctx, out, "/I%d %d 0 R", i, writer->obj_num + i);
343
  fz_write_printf(ctx, out, ">>/Font<</F0 3 0 R>>>>/MediaBox[0 0 %g %g]/Contents %d 0 R>>\nendobj\n",
344
    w * 72.0f / xres, h * 72.0f / yres, writer->obj_num + strips);
345
}
346
347
static void
348
pdfocr_write_header(fz_context *ctx, fz_band_writer *writer_, fz_colorspace *cs)
349
{
350
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
351
  fz_output *out = writer->super.out;
352
  int w = writer->super.w;
353
  int h = writer->super.h;
354
  int n = writer->super.n;
355
  int s = writer->super.s;
356
  int a = writer->super.alpha;
357
  int sh = writer->options.strip_height;
358
359
  if (sh == 0)
360
    sh = h;
361
  assert(sh != 0 && "pdfocr_write_header() should not be given zero height input.");
362
363
  if (a != 0)
364
    fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR cannot write alpha channel");
365
  if (s != 0)
366
    fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR cannot write spot colors");
367
  if (n != 3 && n != 1)
368
    fz_throw(ctx, FZ_ERROR_ARGUMENT, "PDFOCR expected to be Grayscale or RGB");
369
370
  fz_free(ctx, writer->stripbuf);
371
  writer->stripbuf = NULL;
372
  fz_free(ctx, writer->compbuf);
373
  writer->compbuf = NULL;
374
  fz_drop_pixmap(ctx, writer->ocrbitmap);
375
  writer->ocrbitmap = NULL;
376
377
  /* Send the file header on the first page */
378
  if (writer->pages == 0)
379
  {
380
    fz_write_string(ctx, out, "%PDF-1.4\n%PDFOCR-1.0\n");
381
382
    if (writer->xref_max < 9)
383
    {
384
      int new_max = 9;
385
      writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t);
386
      writer->xref_max = new_max;
387
    }
388
    writer->xref[3] = fz_tell_output(ctx, out);
389
    fz_write_data(ctx, out, funky_font,  sizeof(funky_font)-1);
390
    writer->xref[4] = fz_tell_output(ctx, out);
391
    fz_write_data(ctx, out, funky_font2, sizeof(funky_font2)-1);
392
    writer->xref[5] = fz_tell_output(ctx, out);
393
    fz_write_data(ctx, out, funky_font3, sizeof(funky_font3)-1);
394
    writer->xref[6] = fz_tell_output(ctx, out);
395
    fz_write_data(ctx, out, funky_font4, sizeof(funky_font4)-1);
396
    writer->xref[7] = fz_tell_output(ctx, out);
397
    fz_write_data(ctx, out, funky_font5, sizeof(funky_font5)-1);
398
    writer->xref[8] = fz_tell_output(ctx, out);
399
    fz_write_data(ctx, out, funky_font6, sizeof(funky_font6)-1);
400
  }
401
402
  if (writer->page_max <= writer->pages)
403
  {
404
    int new_max = writer->page_max * 2;
405
    if (new_max == 0)
406
      new_max = writer->pages + 8;
407
    writer->page_obj = fz_realloc_array(ctx, writer->page_obj, new_max, int);
408
    writer->page_max = new_max;
409
  }
410
  writer->page_obj[writer->pages] = writer->obj_num;
411
  writer->pages++;
412
413
  if (writer->options.skew_correct)
414
    writer->skew_bitmap = fz_new_pixmap(ctx, n == 3 ? fz_device_rgb(ctx) : fz_device_gray(ctx), w, h, NULL, 0);
415
  else
416
    post_skew_write_header(ctx, writer, w, h);
417
}
418
419
static void
420
flush_strip(fz_context *ctx, pdfocr_band_writer *writer, int fill)
421
{
422
  unsigned char *data = writer->stripbuf;
423
  fz_output *out = writer->super.out;
424
  int w = writer->deskewed_w;
425
  int n = writer->super.n;
426
  size_t len = (size_t)w*n*fill;
427
428
  /* Buffer is full, compress it and write it. */
429
  if (writer->options.compress)
430
  {
431
    size_t destLen = writer->complen;
432
    fz_deflate(ctx, writer->compbuf, &destLen, data, len, FZ_DEFLATE_DEFAULT);
433
    len = destLen;
434
    data = writer->compbuf;
435
  }
436
  fz_write_printf(ctx, out, "%d 0 obj\n<</Width %d/ColorSpace/Device%s/Height %d%s/Subtype/Image",
437
    new_obj(ctx, writer), w, n == 1 ? "Gray" : "RGB", fill, writer->options.compress ? "/Filter/FlateDecode" : "");
438
  fz_write_printf(ctx, out, "/Length %zd/Type/XObject/BitsPerComponent 8>>\nstream\n", len);
439
  fz_write_data(ctx, out, data, len);
440
  fz_write_string(ctx, out, "\nendstream\nendobj\n");
441
}
442
443
static void
444
post_skew_write_band(fz_context *ctx, pdfocr_band_writer *writer, int stride, int band_start, int band_height, const unsigned char *sp)
445
{
446
  int w = writer->deskewed_w;
447
  int h = writer->deskewed_h;
448
  int n = writer->super.n;
449
  int x, y;
450
  int sh = writer->options.strip_height;
451
  int line;
452
  unsigned char *d;
453
454
  if (sh == 0)
455
    sh = h;
456
457
  for (line = 0; line < band_height; line++)
458
  {
459
    int dstline = (band_start+line) % sh;
460
    memcpy(writer->stripbuf + (size_t)w*n*dstline,
461
      sp + (size_t)line * w * n,
462
      (size_t)w * n);
463
    if (dstline+1 == sh)
464
      flush_strip(ctx, writer, dstline+1);
465
  }
466
  if (band_start + band_height == h && h % sh != 0)
467
    flush_strip(ctx, writer, h % sh);
468
469
  /* Copy strip to ocrbitmap, converting if required. */
470
  d = writer->ocrbitmap->samples;
471
  d += band_start*w;
472
  if (n == 1)
473
  {
474
    for (y = band_height; y > 0; y--)
475
    {
476
      memcpy(d, sp, w);
477
      if (writer->ocrbitmap->w - w)
478
        memset(d + w, 0, writer->ocrbitmap->w - w);
479
      d += writer->ocrbitmap->w;
480
    }
481
  }
482
  else
483
  {
484
    for (y = band_height; y > 0; y--)
485
    {
486
      for (x = w; x > 0; x--)
487
      {
488
        *d++ = (sp[0] + 2*sp[1] + sp[2] + 2)>>2;
489
        sp += 3;
490
      }
491
      for (x = writer->ocrbitmap->w - w; x > 0; x--)
492
        *d++ = 0;
493
    }
494
  }
495
}
496
497
static void
498
pdfocr_write_band(fz_context *ctx, fz_band_writer *writer_, int stride, int band_start, int band_height, const unsigned char *sp)
499
{
500
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
501
  fz_output *out = writer->super.out;
502
  int w = writer->super.w;
503
  int n = writer->super.n;
504
  unsigned char *d;
505
506
  if (!out)
507
    return;
508
509
  if (writer->skew_bitmap)
510
  {
511
    d = writer->skew_bitmap->samples;
512
    d += band_start*w*n;
513
    memcpy(d, sp, w*n*band_height);
514
  }
515
  else
516
    post_skew_write_band(ctx, writer, stride, band_start, band_height, sp);
517
}
518
519
/* Bit flags that can be combined to record every direction of travel
 * seen between the characters of a word (or the words of a line). */
enum
{
  WORD_CONTAINS_L2R = 1 << 0,
  WORD_CONTAINS_R2L = 1 << 1,
  WORD_CONTAINS_T2B = 1 << 2,
  WORD_CONTAINS_B2T = 1 << 3
};
526
527
typedef struct word_t
528
{
529
  struct word_t *next;
530
  float bbox[4];
531
  int dirn;
532
  int len;
533
  int chars[FZ_FLEXIBLE_ARRAY];
534
} word_t;
535
536
typedef struct
537
{
538
  fz_buffer *buf;
539
  pdfocr_band_writer *writer;
540
541
  /* We collate the current word into the following fields: */
542
  int word_max;
543
  int word_len;
544
  int *word_chars;
545
  float word_bbox[4];
546
  int word_dirn;
547
  int word_prev_char_bbox[4];
548
549
  /* When we finish a word, we try to add it to the line. If the
550
   * word fits onto the end of the existing line, great. If not,
551
   * we flush the entire line, and start a new one just with the
552
   * new word. This enables us to output a whole line at once,
553
   * which is beneficial to avoid jittering the font sizes
554
   * up/down, which looks bad when we try to select text in the
555
   * produced PDF. */
556
  word_t *line;
557
  word_t **line_tail;
558
  float line_bbox[4];
559
  int line_dirn;
560
561
  float cur_size;
562
  float cur_scale;
563
  float tx, ty;
564
} char_callback_data_t;
565
566
static void
567
flush_words(fz_context *ctx, char_callback_data_t *cb)
568
{
569
  float size;
570
571
  if (cb->line == NULL)
572
    return;
573
574
  if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0)
575
  {
576
    /* Vertical line */
577
  }
578
  else
579
  {
580
    /* Horizontal line */
581
    size = cb->line_bbox[3] - cb->line_bbox[1];
582
583
    if (size != 0 && size != cb->cur_size)
584
    {
585
      fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
586
      cb->cur_size = size;
587
    }
588
    /* Guard against division by 0. This makes no difference to the
589
     * actual calculation as if size is 0, word->bbox[2] == word->bbox[0]
590
     * too. */
591
    if (size == 0)
592
      size = 1;
593
  }
594
595
  while (cb->line)
596
  {
597
    word_t *word = cb->line;
598
    float x, y;
599
    int i, len = word->len;
600
    float scale;
601
602
    if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0)
603
    {
604
      /* Contains vertical text. */
605
      size = (word->bbox[3] - word->bbox[1]) / len;
606
      if (size == 0)
607
        size = 1;
608
      if (size != cb->cur_size)
609
      {
610
        fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
611
        cb->cur_size = size;
612
      }
613
614
      /* Set the scale so that our glyphs fill the line bbox. */
615
      scale = (cb->line_bbox[2] - cb->line_bbox[0]) / size * 200;
616
      if (scale != 0)
617
      {
618
        float letter_height = (word->bbox[3] - word->bbox[1]) / len;
619
620
        if (scale != cb->cur_scale)
621
        {
622
          fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
623
          cb->cur_scale = scale;
624
        }
625
626
        for (i = 0; i < len; i++)
627
        {
628
          x = word->bbox[0];
629
          y = word->bbox[1] + letter_height * i;
630
          fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
631
          cb->tx = x;
632
          cb->ty = y;
633
634
          fz_append_printf(ctx, cb->buf, "<%04x>Tj\n", word->chars[i]);
635
        }
636
      }
637
    }
638
    else
639
    {
640
      scale = (word->bbox[2] - word->bbox[0]) / size / len * 200;
641
      if (scale != 0)
642
      {
643
        if (scale != cb->cur_scale)
644
        {
645
          fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
646
          cb->cur_scale = scale;
647
        }
648
649
        if ((word->dirn & (WORD_CONTAINS_R2L | WORD_CONTAINS_L2R)) == WORD_CONTAINS_R2L)
650
        {
651
          /* Purely R2L text */
652
          x = word->bbox[0];
653
          y = cb->line_bbox[1];
654
          fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
655
          cb->tx = x;
656
          cb->ty = y;
657
658
          /* Tesseract has sent us R2L text in R2L order (i.e. in Logical order).
659
           * We want to output it in that same logical order, but PDF operators
660
           * all move the point as if outputting L2R. We can either reverse the
661
           * order of chars (bad, because of cut/paste) or we can perform
662
           * gymnastics with the position. We opt for the latter. */
663
          fz_append_printf(ctx, cb->buf, "[");
664
          for (i = 0; i < len; i++)
665
          {
666
            if (i == 0)
667
            {
668
              if (len > 1)
669
                fz_append_printf(ctx, cb->buf, "%d", -500*(len-1));
670
            }
671
            else
672
              fz_append_printf(ctx, cb->buf, "%d", 1000);
673
            fz_append_printf(ctx, cb->buf, "<%04x>", word->chars[i]);
674
          }
675
          fz_append_printf(ctx, cb->buf, "]TJ\n");
676
        }
677
        else
678
        {
679
          /* L2R (or mixed) text */
680
          x = word->bbox[0];
681
          y = cb->line_bbox[1];
682
          fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
683
          cb->tx = x;
684
          cb->ty = y;
685
686
          fz_append_printf(ctx, cb->buf, "<");
687
          for (i = 0; i < len; i++)
688
            fz_append_printf(ctx, cb->buf, "%04x", word->chars[i]);
689
          fz_append_printf(ctx, cb->buf, ">Tj\n");
690
        }
691
      }
692
    }
693
694
    cb->line = word->next;
695
    fz_free(ctx, word);
696
  }
697
698
  cb->line_tail = &cb->line;
699
  cb->line = NULL;
700
  cb->line_dirn = 0;
701
}
702
703
static void
704
queue_word(fz_context *ctx, char_callback_data_t *cb)
705
{
706
  word_t *word;
707
  int line_is_v, line_is_h, word_is_v, word_is_h;
708
709
  if (cb->word_len == 0)
710
    return;
711
712
  word = fz_malloc_flexible(ctx, word_t, chars, cb->word_len);
713
  word->next = NULL;
714
  word->len = cb->word_len;
715
  memcpy(word->bbox, cb->word_bbox, 4*sizeof(float));
716
  memcpy(word->chars, cb->word_chars, cb->word_len * sizeof(int));
717
  cb->word_len = 0;
718
719
  line_is_v = !!(cb->line_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B));
720
  word_is_v = !!(cb->word_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B));
721
  line_is_h = !!(cb->line_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L));
722
  word_is_h = !!(cb->word_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L));
723
724
  word->dirn = cb->word_dirn;
725
  cb->word_dirn = 0;
726
727
  /* Can we put the new word onto the end of the existing line? */
728
  if (cb->line != NULL &&
729
    !line_is_v && !word_is_v &&
730
    word->bbox[1] <= cb->line_bbox[3] &&
731
    word->bbox[3] >= cb->line_bbox[1] &&
732
    (word->bbox[0] >= cb->line_bbox[2] || word->bbox[2] <= cb->line_bbox[0]))
733
  {
734
    /* Can append (horizontal motion). */
735
    if (word->bbox[0] < cb->line_bbox[0])
736
      cb->line_bbox[0] = word->bbox[0];
737
    if (word->bbox[1] < cb->line_bbox[1])
738
      cb->line_bbox[1] = word->bbox[1];
739
    if (word->bbox[2] > cb->line_bbox[2])
740
      cb->line_bbox[2] = word->bbox[2];
741
    if (word->bbox[3] > cb->line_bbox[3])
742
      cb->line_bbox[3] = word->bbox[3];
743
  }
744
  else if (cb->line != NULL &&
745
    !line_is_h && !word_is_h &&
746
    word->bbox[0] <= cb->line_bbox[2] &&
747
    word->bbox[2] >= cb->line_bbox[0] &&
748
    (word->bbox[1] >= cb->line_bbox[3] || word->bbox[3] <= cb->line_bbox[1]))
749
  {
750
    /* Can append (vertical motion). */
751
    if (!word_is_v)
752
      word->dirn |= WORD_CONTAINS_T2B;
753
    if (word->bbox[0] < cb->line_bbox[0])
754
      cb->line_bbox[0] = word->bbox[0];
755
    if (word->bbox[1] < cb->line_bbox[1])
756
      cb->line_bbox[1] = word->bbox[1];
757
    if (word->bbox[2] > cb->line_bbox[2])
758
      cb->line_bbox[2] = word->bbox[2];
759
    if (word->bbox[3] > cb->line_bbox[3])
760
      cb->line_bbox[3] = word->bbox[3];
761
  }
762
  else
763
  {
764
    fz_try(ctx)
765
      flush_words(ctx, cb);
766
    fz_catch(ctx)
767
    {
768
      fz_free(ctx, word);
769
      fz_rethrow(ctx);
770
    }
771
    memcpy(cb->line_bbox, word->bbox, 4*sizeof(float));
772
  }
773
774
  *cb->line_tail = word;
775
  cb->line_tail = &word->next;
776
  cb->line_dirn |= word->dirn;
777
}
778
779
static void
780
char_callback(fz_context *ctx, void *arg, int unicode,
781
    const char *font_name,
782
    const int *line_bbox, const int *word_bbox,
783
    const int *char_bbox, int pointsize)
784
{
785
  char_callback_data_t *cb = (char_callback_data_t *)arg;
786
  pdfocr_band_writer *writer = cb->writer;
787
  float bbox[4];
788
789
  bbox[0] = word_bbox[0] * 72.0f / cb->writer->ocrbitmap->xres;
790
  bbox[3] = (writer->ocrbitmap->h - 1 - word_bbox[1]) * 72.0f / cb->writer->ocrbitmap->yres;
791
  bbox[2] = word_bbox[2] * 72.0f / cb->writer->ocrbitmap->yres;
792
  bbox[1] = (writer->ocrbitmap->h - 1 - word_bbox[3]) * 72.0f / cb->writer->ocrbitmap->yres;
793
794
  if (bbox[0] != cb->word_bbox[0] ||
795
    bbox[1] != cb->word_bbox[1] ||
796
    bbox[2] != cb->word_bbox[2] ||
797
    bbox[3] != cb->word_bbox[3])
798
  {
799
    queue_word(ctx, cb);
800
    memcpy(cb->word_bbox, bbox, 4 * sizeof(float));
801
  }
802
803
  if (cb->word_len == 0)
804
  {
805
    cb->word_dirn = 0;
806
    memcpy(cb->word_prev_char_bbox, char_bbox, 4 * sizeof(int));
807
  }
808
  else
809
  {
810
    int ox = cb->word_prev_char_bbox[0] + cb->word_prev_char_bbox[2];
811
    int oy = cb->word_prev_char_bbox[1] + cb->word_prev_char_bbox[3];
812
    int x = char_bbox[0] + char_bbox[2] - ox;
813
    int y = char_bbox[1] + char_bbox[3] - oy;
814
    int ax = x < 0 ? -x : x;
815
    int ay = y < 0 ? -y : y;
816
    if (ax > ay)
817
    {
818
      if (x > 0)
819
        cb->word_dirn |= WORD_CONTAINS_L2R;
820
      else if (x < 0)
821
        cb->word_dirn |= WORD_CONTAINS_R2L;
822
    }
823
    else if (ay < ax)
824
    {
825
      if (y > 0)
826
        cb->word_dirn |= WORD_CONTAINS_T2B;
827
      else if (y < 0)
828
        cb->word_dirn |= WORD_CONTAINS_B2T;
829
    }
830
  }
831
832
  if (cb->word_max == cb->word_len)
833
  {
834
    int newmax = cb->word_max * 2;
835
    if (newmax == 0)
836
      newmax = 16;
837
    cb->word_chars = fz_realloc_array(ctx, cb->word_chars, newmax, int);
838
    cb->word_max = newmax;
839
  }
840
841
  cb->word_chars[cb->word_len++] = unicode;
842
}
843
844
static int
845
pdfocr_progress(fz_context *ctx, void *arg, int prog)
846
{
847
  char_callback_data_t *cb = (char_callback_data_t *)arg;
848
  pdfocr_band_writer *writer = cb->writer;
849
850
  if (writer->progress == NULL)
851
    return 0;
852
853
  return writer->progress(ctx, writer->progress_arg, writer->pages - 1, prog);
854
}
855
856
static void
857
do_skew_correct(fz_context *ctx, pdfocr_band_writer *writer)
858
{
859
  fz_pixmap *deskewed;
860
861
  if (writer->options.skew_correct == 1)
862
    writer->options.skew_angle = fz_detect_skew(ctx, writer->skew_bitmap);
863
864
  deskewed = fz_deskew_pixmap(ctx, writer->skew_bitmap, writer->options.skew_angle, writer->options.skew_border);
865
866
  fz_try(ctx)
867
  {
868
    post_skew_write_header(ctx, writer, deskewed->w, deskewed->h);
869
    post_skew_write_band(ctx, writer, deskewed->stride, 0, deskewed->h, deskewed->samples);
870
  }
871
  fz_always(ctx)
872
    fz_drop_pixmap(ctx, deskewed);
873
  fz_catch(ctx)
874
    fz_rethrow(ctx);
875
}
876
877
static void
878
pdfocr_write_trailer(fz_context *ctx, fz_band_writer *writer_)
879
{
880
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
881
  fz_output *out = writer->super.out;
882
  int xres = writer->super.xres;
883
  int yres = writer->super.yres;
884
  int sh = writer->options.strip_height;
885
  int strips;
886
  int w, h, i;
887
  size_t len;
888
  unsigned char *data;
889
  fz_buffer *buf = NULL;
890
  char_callback_data_t cb = { NULL };
891
892
  if (writer->options.skew_correct)
893
    do_skew_correct(ctx, writer);
894
895
  w = writer->deskewed_w;
896
  h = writer->deskewed_h;
897
  if (sh == 0)
898
    sh = h;
899
  strips = (h + sh-1)/sh;
900
901
  /* Send the Page contents */
902
  /* We need the length to this, so write to a buffer first */
903
  fz_var(buf);
904
  fz_var(cb);
905
  fz_try(ctx)
906
  {
907
    cb.writer = writer;
908
    cb.buf = buf = fz_new_buffer(ctx, 0);
909
    cb.line_tail = &cb.line;
910
    cb.word_dirn = 0;
911
    cb.line_dirn = 0;
912
    fz_append_printf(ctx, buf, "q\n%g 0 0 %g 0 0 cm\n", 72.0f/xres, 72.0f/yres);
913
    for (i = 0; i < strips; i++)
914
    {
915
      int at = h - (i+1)*sh;
916
      int this_sh = sh;
917
      if (at < 0)
918
      {
919
        this_sh += at;
920
        at = 0;
921
      }
922
      fz_append_printf(ctx, buf, "/P <</MCID 0>> BDC\nq\n%d 0 0 %d 0 %d cm\n/I%d Do\nQ\n",
923
        w, this_sh, at, i);
924
    }
925
926
    fz_append_printf(ctx, buf, "Q\nBT\n3 Tr\n");
927
928
    ocr_recognise(ctx, writer->tessapi, writer->ocrbitmap, char_callback, pdfocr_progress, &cb);
929
    queue_word(ctx, &cb);
930
    flush_words(ctx, &cb);
931
    fz_append_printf(ctx, buf, "ET\n");
932
933
    len = fz_buffer_storage(ctx, buf, &data);
934
    fz_write_printf(ctx, out, "%d 0 obj\n<</Length %zd>>\nstream\n", new_obj(ctx, writer), len);
935
    fz_write_data(ctx, out, data, len);
936
    fz_drop_buffer(ctx, buf);
937
    buf = NULL;
938
    fz_write_string(ctx, out, "\nendstream\nendobj\n");
939
  }
940
  fz_always(ctx)
941
  {
942
    fz_free(ctx, cb.word_chars);
943
  }
944
  fz_catch(ctx)
945
  {
946
    fz_drop_buffer(ctx, buf);
947
    fz_rethrow(ctx);
948
  }
949
}
950
951
static void
952
pdfocr_close_band_writer(fz_context *ctx, fz_band_writer *writer_)
953
{
954
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
955
  fz_output *out = writer->super.out;
956
  int i;
957
958
  fz_warn_on_unused_options(ctx, writer->options.options, "pdfocr band-writer");
959
960
  /* We actually do the trailer writing in the close */
961
  if (writer->xref_max > 2)
962
  {
963
    int64_t t_pos;
964
965
    /* Catalog */
966
    writer->xref[1] = fz_tell_output(ctx, out);
967
    fz_write_printf(ctx, out, "1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n");
968
969
    /* Page table */
970
    writer->xref[2] = fz_tell_output(ctx, out);
971
    fz_write_printf(ctx, out, "2 0 obj\n<</Count %d/Kids[", writer->pages);
972
973
    for (i = 0; i < writer->pages; i++)
974
    {
975
      if (i > 0)
976
        fz_write_byte(ctx, out, ' ');
977
      fz_write_printf(ctx, out, "%d 0 R", writer->page_obj[i]);
978
    }
979
    fz_write_string(ctx, out, "]/Type/Pages>>\nendobj\n");
980
981
    /* Xref */
982
    t_pos = fz_tell_output(ctx, out);
983
    fz_write_printf(ctx, out, "xref\n0 %d\n0000000000 65535 f \n", writer->obj_num);
984
    for (i = 1; i < writer->obj_num; i++)
985
      fz_write_printf(ctx, out, "%010ld 00000 n \n", writer->xref[i]);
986
    fz_write_printf(ctx, out, "trailer\n<</Size %d/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n", writer->obj_num, t_pos);
987
  }
988
}
989
990
static void
991
pdfocr_drop_band_writer(fz_context *ctx, fz_band_writer *writer_)
992
{
993
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
994
995
  fz_drop_options(ctx, writer->options.options);
996
  fz_free(ctx, writer->stripbuf);
997
  fz_free(ctx, writer->compbuf);
998
  fz_free(ctx, writer->page_obj);
999
  fz_free(ctx, writer->xref);
1000
  fz_drop_pixmap(ctx, writer->ocrbitmap);
1001
  ocr_fin(ctx, writer->tessapi);
1002
}
1003
#endif
1004
1005
/*
  Create a band writer that writes OCR'd PDF to 'out'.

  options: settings copied into the new writer. May be NULL, in which
  case the writer's settings are zero-filled.

  Throws FZ_ERROR_UNSUPPORTED when built with OCR_DISABLED. If taking
  the option reference or initialising the OCR engine throws, the
  partly constructed writer is dropped and the error is rethrown.
*/
fz_band_writer *fz_new_pdfocr_band_writer(fz_context *ctx, fz_output *out, const fz_pdfocr_options *options)
{
#ifdef OCR_DISABLED
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
#else
  pdfocr_band_writer *ocr = fz_new_band_writer(ctx, pdfocr_band_writer, out);

  /* Callbacks must be in place before anything can throw, because the
   * error path below relies on the drop callback. */
  ocr->super.header = pdfocr_write_header;
  ocr->super.band = pdfocr_write_band;
  ocr->super.trailer = pdfocr_write_trailer;
  ocr->super.close = pdfocr_close_band_writer;
  ocr->super.drop = pdfocr_drop_band_writer;

  if (options == NULL)
  {
    memset(&ocr->options, 0, sizeof(ocr->options));
  }
  else
  {
    ocr->options = *options;
    /* Do not hold the caller's option set pointer without a reference
     * of our own; one is taken inside the fz_try below. */
    ocr->options.options = NULL;
  }

  /* Object numbers 1 to 8 are spoken for:
   *  1 catalog (reserved)
   *  2 pages tree
   *  3 font
   *  4 cidfont
   *  5 cid to gid map
   *  6 tounicode
   *  7 font descriptor
   *  8 font file
   * so numbering of further objects starts at 9.
   */
  ocr->obj_num = 9;

  fz_try(ctx)
  {
    if (options != NULL && options->options != NULL)
      ocr->options.options = fz_keep_options(ctx, options->options);
    ocr->tessapi = ocr_init(ctx, ocr->options.language, ocr->options.datadir, ocr->options.options);
  }
  fz_catch(ctx)
  {
    fz_drop_band_writer(ctx, &ocr->super);
    fz_rethrow(ctx);
  }

  return &ocr->super;
#endif
}
1053
1054
void
1055
fz_pdfocr_band_writer_set_progress(fz_context *ctx, fz_band_writer *writer_, fz_pdfocr_progress_fn *progress, void *progress_arg)
1056
0
{
1057
0
#ifdef OCR_DISABLED
1058
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
1059
#else
1060
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
1061
  if (writer == NULL)
1062
    return;
1063
  if (writer->super.header != pdfocr_write_header)
1064
    fz_throw(ctx, FZ_ERROR_ARGUMENT, "Not a pdfocr band writer!");
1065
1066
  writer->progress = progress;
1067
  writer->progress_arg = progress_arg;
1068
#endif
1069
0
}
1070
1071
void
1072
fz_save_pixmap_as_pdfocr(fz_context *ctx, fz_pixmap *pixmap, char *filename, int append, const fz_pdfocr_options *pdfocr)
1073
0
{
1074
0
#ifdef OCR_DISABLED
1075
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
1076
#else
1077
  fz_output *out = fz_new_output_with_path(ctx, filename, append);
1078
  fz_try(ctx)
1079
  {
1080
    fz_write_pixmap_as_pdfocr(ctx, out, pixmap, pdfocr);
1081
    fz_close_output(ctx, out);
1082
  }
1083
  fz_always(ctx)
1084
    fz_drop_output(ctx, out);
1085
  fz_catch(ctx)
1086
    fz_rethrow(ctx);
1087
#endif
1088
0
}
1089
1090
/* High-level document writer interface */
1091
1092
#ifndef OCR_DISABLED
1093
/*
  State of the high-level pdfocr document writer. The page, close and
  drop callbacks in this file cast their fz_document_writer pointer
  back to this struct.
*/
typedef struct
1094
{
1095
  /* Base document writer that the callbacks are handed. */
  fz_document_writer super;
1096
  /* Draw options passed to fz_new_draw_device_with_options for each page. */
  fz_draw_options draw;
1097
  /* OCR/PDF options handed to fz_new_pdfocr_band_writer. */
  fz_pdfocr_options pdfocr;
1098
  /* Pixmap of the page in progress: filled in through begin_page,
   * dropped and reset to NULL in end_page. */
  fz_pixmap *pixmap;
1099
  /* Band writer that receives each finished page's header and samples. */
  fz_band_writer *bander;
1100
  /* Output that is closed in pdfocr_close_writer and dropped in pdfocr_drop_writer. */
  fz_output *out;
1101
  /* Page number passed to fz_write_header; incremented once per page. */
  int pagenum;
1102
} fz_pdfocr_writer;
1103
1104
static fz_device *
1105
pdfocr_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
1106
{
1107
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
1108
  return fz_new_draw_device_with_options(ctx, &wri->draw, mediabox, &wri->pixmap);
1109
}
1110
1111
static void
1112
pdfocr_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
1113
{
1114
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
1115
  fz_pixmap *pix = wri->pixmap;
1116
1117
  fz_try(ctx)
1118
  {
1119
    fz_close_device(ctx, dev);
1120
    fz_write_header(ctx, wri->bander, pix->w, pix->h, pix->n, pix->alpha, pix->xres, pix->yres, wri->pagenum++, pix->colorspace, pix->seps);
1121
    fz_write_band(ctx, wri->bander, pix->stride, pix->h, pix->samples);
1122
  }
1123
  fz_always(ctx)
1124
  {
1125
    fz_drop_device(ctx, dev);
1126
    fz_drop_pixmap(ctx, pix);
1127
    wri->pixmap = NULL;
1128
  }
1129
  fz_catch(ctx)
1130
    fz_rethrow(ctx);
1131
}
1132
1133
static void
1134
pdfocr_close_writer(fz_context *ctx, fz_document_writer *wri_)
1135
{
1136
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
1137
1138
  fz_close_band_writer(ctx, wri->bander);
1139
  fz_close_output(ctx, wri->out);
1140
}
1141
1142
static void
1143
pdfocr_drop_writer(fz_context *ctx, fz_document_writer *wri_)
1144
{
1145
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
1146
1147
  fz_drop_pixmap(ctx, wri->pixmap);
1148
  fz_drop_band_writer(ctx, wri->bander);
1149
  fz_drop_output(ctx, wri->out);
1150
}
1151
#endif
1152
1153
fz_document_writer *
1154
fz_new_pdfocr_writer_with_output(fz_context *ctx, fz_output *out, const char *options_string)
1155
0
{
1156
0
#ifdef OCR_DISABLED
1157
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
1158
#else
1159
  fz_options *options = NULL;
1160
  fz_pdfocr_writer *wri = NULL;
1161
1162
  fz_var(wri);
1163
  fz_var(options);
1164
1165
  fz_try(ctx)
1166
  {
1167
    options = fz_new_options(ctx, options_string);
1168
    wri = fz_new_derived_document_writer(ctx, fz_pdfocr_writer, pdfocr_begin_page, pdfocr_end_page, pdfocr_close_writer, pdfocr_drop_writer);
1169
    fz_init_draw_options(ctx, &wri->draw);
1170
    fz_init_pdfocr_options(ctx, &wri->pdfocr);
1171
    fz_apply_draw_options(ctx, &wri->draw, options);
1172
    fz_apply_pdfocr_options(ctx, &wri->pdfocr, options);
1173
    fz_throw_on_unused_options(ctx, options, "draw and ocr");
1174
    wri->out = out;
1175
    wri->bander = fz_new_pdfocr_band_writer(ctx, wri->out, &wri->pdfocr);
1176
  }
1177
  fz_catch(ctx)
1178
  {
1179
    fz_drop_options(ctx, options);
1180
    fz_drop_output(ctx, out);
1181
    fz_free(ctx, wri);
1182
    fz_rethrow(ctx);
1183
  }
1184
1185
  return (fz_document_writer*)wri;
1186
#endif
1187
0
}
1188
1189
fz_document_writer *
1190
fz_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options)
1191
0
{
1192
0
#ifdef OCR_DISABLED
1193
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
1194
#else
1195
  fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.pdfocr", 0);
1196
  return fz_new_pdfocr_writer_with_output(ctx, out, options);
1197
#endif
1198
0
}
1199
1200
void
1201
fz_pdfocr_writer_set_progress(fz_context *ctx, fz_document_writer *writer, fz_pdfocr_progress_fn *progress, void *progress_arg)
1202
0
{
1203
0
#ifdef OCR_DISABLED
1204
0
  fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "No OCR support in this build");
1205
#else
1206
  fz_pdfocr_writer *wri = (fz_pdfocr_writer *)writer;
1207
  if (!writer)
1208
    return;
1209
  if (writer->begin_page != pdfocr_begin_page)
1210
    fz_throw(ctx, FZ_ERROR_ARGUMENT, "Not a pdfocr writer!");
1211
  fz_pdfocr_band_writer_set_progress(ctx, wri->bander, progress, progress_arg);
1212
#endif
1213
0
}