Coverage Report

Created: 2023-06-07 06:20

/src/mupdf/source/fitz/output-pdfocr.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2021 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#include <assert.h>
26
#include <string.h>
27
#include <limits.h>
28
29
#ifdef OCR_DISABLED
30
31
/* Builds without OCR must still define this symbol: without it SWIG
 * Python gets a SEGV when it attempts to import mupdf.py and
 * _mupdf.py. The usage text is simply empty in that configuration. */
const char *fz_pdfocr_write_options_usage = "";
34
35
#else
36
37
#include "tessocr.h"
38
39
/* Help text describing the options understood by
 * fz_parse_pdfocr_options(): one option per line, each line
 * tab-indented, with a blank line terminating the list. */
const char *fz_pdfocr_write_options_usage =
  "PDFOCR output options:\n"
  "\tcompression=none: No compression (default)\n"
  "\tcompression=flate: Flate compression\n"
  "\tstrip-height=N: Strip height (default 0=fullpage)\n"
  "\tocr-language=<lang>: OCR language (default=eng)\n"
  "\tocr-datadir=<datadir>: OCR data path (default=rely on TESSDATA_PREFIX)\n"
  "\n";
47
48
/* PDF object 3: the Type0 font dictionary for the "GlyphLessFont"
 * used for the OCR text. It points at object 4 as its descendant
 * font and object 6 as its ToUnicode map. */
static const char funky_font[] =
  "3 0 obj\n"
  "<<"
  "/BaseFont/GlyphLessFont"
  "/DescendantFonts[4 0 R]"
  "/Encoding/Identity-H"
  "/Subtype/Type0"
  "/ToUnicode 6 0 R"
  "/Type/Font"
  ">>\n"
  "endobj\n";
52
53
/* PDF object 4: the CIDFontType2 descendant font. It references
 * object 5 (CIDToGIDMap) and object 7 (FontDescriptor), and gives
 * every glyph a default width (DW) of 500. */
static const char funky_font2[] =
  "4 0 obj\n"
  "<<"
  "/BaseFont/GlyphLessFont"
  "/CIDToGIDMap 5 0 R"
  "/CIDSystemInfo<</Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
  "/FontDescriptor 7 0 R"
  "/Subtype/CIDFontType2"
  "/Type/Font"
  "/DW 500"
  ">>\n"
  "endobj\n";
59
60
/* PDF object 5: the CIDToGIDMap stream referenced by object 4.
 * The payload is 210 bytes of Flate-compressed data (matching the
 * /Length entry); it contains NUL bytes, so always take its size
 * with sizeof rather than strlen. */
static const char funky_font3[] =
  "5 0 obj\n<</Length 210/Filter/FlateDecode>>\nstream\n"
  "\x78\x9c\xec\xc2\x01\x09\x00\x00\x00\x02\xa0\xfa\x7f\xba\x21\x89"
  "\xa6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  "\x80\x7b\x03\x00\x00\xff\xff\xec\xc2\x01\x0d\x00\x00\x00\xc2\x20"
  "\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  "\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xec\xc2\x01\x0d\x00"
  "\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00"
  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xed"
  "\xc2\x01\x0d\x00\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00"
  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
  "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\xff"
  "\x00\x10"
  "\nendstream\nendobj\n";
77
78
/* PDF object 6: the ToUnicode CMap referenced by object 3. A single
 * bfrange maps every two-byte code <0000>..<FFFF> onto the Unicode
 * value with the same number. The stream body is 353 bytes long,
 * matching the /Length entry. */
static const char funky_font4[] =
  "6 0 obj\n"
  "<</Length 353>>\n"
  "stream\n"
  /* CMap prologue */
  "/CIDInit /ProcSet findresource begin\n"
  "12 dict begin\n"
  "begincmap\n"
  "/CIDSystemInfo\n"
  "<<\n"
  "  /Registry (Adobe)\n"
  "  /Ordering (UCS)\n"
  "  /Supplement 0\n"
  ">> def\n"
  "/CMapName /Adobe-Identity-UCS def\n"
  "/CMapType 2 def\n"
  /* Code space and the identity mapping */
  "1 begincodespacerange\n"
  "<0000> <FFFF>\n"
  "endcodespacerange\n"
  "1 beginbfrange\n"
  "<0000> <FFFF> <0000>\n"
  "endbfrange\n"
  /* CMap epilogue */
  "endcmap\n"
  "CMapName currentdict /CMap defineresource pop\n"
  "end\n"
  "end\n"
  "endstream\n"
  "endobj\n";
103
104
/* PDF object 7: the FontDescriptor for GlyphLessFont, referenced by
 * object 4. The embedded font program is object 8 (FontFile2). */
static const char funky_font5[] =
  "7 0 obj\n"
  "<<"
  "/Ascent 1000"
  "/CapHeight 1000"
  "/Descent -1"
  "/Flags 5"
  "/FontBBox[0 0 500 1000]"
  "/FontFile2 8 0 R"
  "/FontName/GlyphLessFont"
  "/ItalicAngle 0"
  "/StemV 80"
  "/Type/FontDescriptor"
  ">>\n"
  "endobj\n";
109
110
static const char funky_font6[] =
111
"8 0 obj\n<</Length 572/Length1 572>>\nstream\n"
112
"\x00\x01\x00\x00\x00\x0a\x00\x80\x00\x03\x00\x20\x4f\x53\x2f\x32"
113
"\x56\xde\xc8\x94\x00\x00\x01\x28\x00\x00\x00\x60\x63\x6d\x61\x70"
114
"\x00\x0a\x00\x34\x00\x00\x01\x90\x00\x00\x00\x1e\x67\x6c\x79\x66"
115
"\x15\x22\x41\x24\x00\x00\x01\xb8\x00\x00\x00\x18\x68\x65\x61\x64"
116
"\x0b\x78\xf1\x65\x00\x00\x00\xac\x00\x00\x00\x36\x68\x68\x65\x61"
117
"\x0c\x02\x04\x02\x00\x00\x00\xe4\x00\x00\x00\x24\x68\x6d\x74\x78"
118
"\x04\x00\x00\x00\x00\x00\x01\x88\x00\x00\x00\x08\x6c\x6f\x63\x61"
119
"\x00\x0c\x00\x00\x00\x00\x01\xb0\x00\x00\x00\x06\x6d\x61\x78\x70"
120
"\x00\x04\x00\x05\x00\x00\x01\x08\x00\x00\x00\x20\x6e\x61\x6d\x65"
121
"\xf2\xeb\x16\xda\x00\x00\x01\xd0\x00\x00\x00\x4b\x70\x6f\x73\x74"
122
"\x00\x01\x00\x01\x00\x00\x02\x1c\x00\x00\x00\x20\x00\x01\x00\x00"
123
"\x00\x01\x00\x00\xb0\x94\x71\x10\x5f\x0f\x3c\xf5\x04\x07\x08\x00"
124
"\x00\x00\x00\x00\xcf\x9a\xfc\x6e\x00\x00\x00\x00\xd4\xc3\xa7\xf2"
125
"\x00\x00\x00\x00\x04\x00\x08\x00\x00\x00\x00\x10\x00\x02\x00\x00"
126
"\x00\x00\x00\x00\x00\x01\x00\x00\x08\x00\xff\xff\x00\x00\x04\x00"
127
"\x00\x00\x00\x00\x04\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
128
"\x00\x00\x00\x00\x00\x00\x00\x02\x00\x01\x00\x00\x00\x02\x00\x04"
129
"\x00\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
130
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x01\x90\x00\x05"
131
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
132
"\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x01\x00\x01\x00\x00\x00"
133
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
134
"\x00\x00\x47\x4f\x4f\x47\x00\x40\x00\x00\x00\x00\x00\x01\xff\xff"
135
"\x00\x00\x00\x01\x00\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00"
136
"\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00"
137
"\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00\x14\x00\x03\x00\x00"
138
"\x00\x00\x00\x14\x00\x06\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00"
139
"\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x00\x04\x00"
140
"\x08\x00\x00\x03\x00\x00\x31\x21\x11\x21\x04\x00\xfc\x00\x08\x00"
141
"\x00\x00\x00\x03\x00\x2a\x00\x00\x00\x03\x00\x00\x00\x05\x00\x16"
142
"\x00\x00\x00\x01\x00\x00\x00\x00\x00\x05\x00\x0b\x00\x16\x00\x03"
143
"\x00\x01\x04\x09\x00\x05\x00\x16\x00\x00\x00\x56\x00\x65\x00\x72"
144
"\x00\x73\x00\x69\x00\x6f\x00\x6e\x00\x20\x00\x31\x00\x2e\x00\x30"
145
"\x56\x65\x72\x73\x69\x6f\x6e\x20\x31\x2e\x30\x00\x00\x01\x00\x00"
146
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00"
147
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
148
"\nendstream\nendobj\n";
149
150
#endif
151
152
fz_pdfocr_options *
153
fz_parse_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, const char *args)
154
0
{
155
0
#ifdef OCR_DISABLED
156
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
157
#else
158
  const char *val;
159
160
  memset(opts, 0, sizeof *opts);
161
162
  if (fz_has_option(ctx, args, "compression", &val))
163
  {
164
    if (fz_option_eq(val, "none"))
165
      opts->compress = 0;
166
    else if (fz_option_eq(val, "flate"))
167
      opts->compress = 1;
168
    else
169
      fz_throw(ctx, FZ_ERROR_GENERIC, "Unsupported PDFOCR compression %s (none, or flate only)", val);
170
  }
171
  if (fz_has_option(ctx, args, "strip-height", &val))
172
  {
173
    int i = fz_atoi(val);
174
    if (i <= 0)
175
      fz_throw(ctx, FZ_ERROR_GENERIC, "Unsupported PDFOCR strip height %d (suggest 0)", i);
176
    opts->strip_height = i;
177
  }
178
  if (fz_has_option(ctx, args, "ocr-language", &val))
179
  {
180
    fz_copy_option(ctx, val, opts->language, nelem(opts->language));
181
  }
182
  if (fz_has_option(ctx, args, "ocr-datadir", &val))
183
  {
184
    fz_copy_option(ctx, val, opts->datadir, nelem(opts->datadir));
185
  }
186
187
  return opts;
188
#endif
189
0
}
190
191
void
192
fz_write_pixmap_as_pdfocr(fz_context *ctx, fz_output *out, const fz_pixmap *pixmap, const fz_pdfocr_options *pdfocr)
193
0
{
194
0
#ifdef OCR_DISABLED
195
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
196
#else
197
  fz_band_writer *writer;
198
199
  if (!pixmap || !out)
200
    return;
201
202
  writer = fz_new_pdfocr_band_writer(ctx, out, pdfocr);
203
  fz_try(ctx)
204
  {
205
    fz_write_header(ctx, writer, pixmap->w, pixmap->h, pixmap->n, pixmap->alpha, pixmap->xres, pixmap->yres, 0, pixmap->colorspace, pixmap->seps);
206
    fz_write_band(ctx, writer, pixmap->stride, pixmap->h, pixmap->samples);
207
    fz_close_band_writer(ctx, writer);
208
  }
209
  fz_always(ctx)
210
    fz_drop_band_writer(ctx, writer);
211
  fz_catch(ctx)
212
    fz_rethrow(ctx);
213
#endif
214
0
}
215
216
#ifndef OCR_DISABLED
217
typedef struct pdfocr_band_writer_s
218
{
219
  fz_band_writer super;
220
  fz_pdfocr_options options;
221
222
  int obj_num;
223
  int xref_max;
224
  int64_t *xref;
225
  int pages;
226
  int page_max;
227
  int *page_obj;
228
  unsigned char *stripbuf;
229
  unsigned char *compbuf;
230
  size_t complen;
231
232
  void *tessapi;
233
  fz_pixmap *ocrbitmap;
234
235
  fz_pdfocr_progress_fn *progress;
236
  void *progress_arg;
237
} pdfocr_band_writer;
238
239
static int
240
new_obj(fz_context *ctx, pdfocr_band_writer *writer)
241
{
242
  int64_t pos = fz_tell_output(ctx, writer->super.out);
243
244
  if (writer->obj_num >= writer->xref_max)
245
  {
246
    int new_max = writer->xref_max * 2;
247
    if (new_max < writer->obj_num + 8)
248
      new_max = writer->obj_num + 8;
249
    writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t);
250
    writer->xref_max = new_max;
251
  }
252
253
  writer->xref[writer->obj_num] = pos;
254
255
  return writer->obj_num++;
256
}
257
258
static void
259
pdfocr_write_header(fz_context *ctx, fz_band_writer *writer_, fz_colorspace *cs)
260
{
261
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
262
  fz_output *out = writer->super.out;
263
  int w = writer->super.w;
264
  int h = writer->super.h;
265
  int n = writer->super.n;
266
  int s = writer->super.s;
267
  int a = writer->super.alpha;
268
  int xres = writer->super.xres;
269
  int yres = writer->super.yres;
270
  int sh = writer->options.strip_height;
271
  int strips;
272
  int i;
273
274
  if (sh == 0)
275
    sh = h;
276
  assert(sh != 0 && "pdfocr_write_header() should not be given zero height input.");
277
  strips = (h + sh-1)/sh;
278
279
  if (a != 0)
280
    fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR cannot write alpha channel");
281
  if (s != 0)
282
    fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR cannot write spot colors");
283
  if (n != 3 && n != 1)
284
    fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR expected to be Grayscale or RGB");
285
286
  fz_free(ctx, writer->stripbuf);
287
  writer->stripbuf = NULL;
288
  fz_free(ctx, writer->compbuf);
289
  writer->compbuf = NULL;
290
  fz_drop_pixmap(ctx, writer->ocrbitmap);
291
  writer->ocrbitmap = NULL;
292
  writer->stripbuf = Memento_label(fz_malloc(ctx, (size_t)w * sh * n), "pdfocr_stripbuf");
293
  writer->complen = fz_deflate_bound(ctx, (size_t)w * sh * n);
294
  writer->compbuf = Memento_label(fz_malloc(ctx, writer->complen), "pdfocr_compbuf");
295
  /* Always round the width of ocrbitmap up to a multiple of 4. */
296
  writer->ocrbitmap = fz_new_pixmap(ctx, NULL, (w+3)&~3, h, NULL, 0);
297
  fz_set_pixmap_resolution(ctx, writer->ocrbitmap, xres, yres);
298
299
  /* Send the file header on the first page */
300
  if (writer->pages == 0)
301
  {
302
    fz_write_string(ctx, out, "%PDF-1.4\n%PDFOCR-1.0\n");
303
304
    if (writer->xref_max < 9)
305
    {
306
      int new_max = 9;
307
      writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t);
308
      writer->xref_max = new_max;
309
    }
310
    writer->xref[3] = fz_tell_output(ctx, out);
311
    fz_write_data(ctx, out, funky_font,  sizeof(funky_font)-1);
312
    writer->xref[4] = fz_tell_output(ctx, out);
313
    fz_write_data(ctx, out, funky_font2, sizeof(funky_font2)-1);
314
    writer->xref[5] = fz_tell_output(ctx, out);
315
    fz_write_data(ctx, out, funky_font3, sizeof(funky_font3)-1);
316
    writer->xref[6] = fz_tell_output(ctx, out);
317
    fz_write_data(ctx, out, funky_font4, sizeof(funky_font4)-1);
318
    writer->xref[7] = fz_tell_output(ctx, out);
319
    fz_write_data(ctx, out, funky_font5, sizeof(funky_font5)-1);
320
    writer->xref[8] = fz_tell_output(ctx, out);
321
    fz_write_data(ctx, out, funky_font6, sizeof(funky_font6)-1);
322
  }
323
324
  if (writer->page_max <= writer->pages)
325
  {
326
    int new_max = writer->page_max * 2;
327
    if (new_max == 0)
328
      new_max = writer->pages + 8;
329
    writer->page_obj = fz_realloc_array(ctx, writer->page_obj, new_max, int);
330
    writer->page_max = new_max;
331
  }
332
  writer->page_obj[writer->pages] = writer->obj_num;
333
  writer->pages++;
334
335
  /* Send the Page Object */
336
  fz_write_printf(ctx, out, "%d 0 obj\n<</Type/Page/Parent 2 0 R/Resources<</XObject<<", new_obj(ctx, writer));
337
  for (i = 0; i < strips; i++)
338
    fz_write_printf(ctx, out, "/I%d %d 0 R", i, writer->obj_num + i);
339
  fz_write_printf(ctx, out, ">>/Font<</F0 3 0 R>>>>/MediaBox[0 0 %g %g]/Contents %d 0 R>>\nendobj\n",
340
    w * 72.0f / xres, h * 72.0f / yres, writer->obj_num + strips);
341
}
342
343
static void
344
flush_strip(fz_context *ctx, pdfocr_band_writer *writer, int fill)
345
{
346
  unsigned char *data = writer->stripbuf;
347
  fz_output *out = writer->super.out;
348
  int w = writer->super.w;
349
  int n = writer->super.n;
350
  size_t len = (size_t)w*n*fill;
351
352
  /* Buffer is full, compress it and write it. */
353
  if (writer->options.compress)
354
  {
355
    size_t destLen = writer->complen;
356
    fz_deflate(ctx, writer->compbuf, &destLen, data, len, FZ_DEFLATE_DEFAULT);
357
    len = destLen;
358
    data = writer->compbuf;
359
  }
360
  fz_write_printf(ctx, out, "%d 0 obj\n<</Width %d/ColorSpace/Device%s/Height %d%s/Subtype/Image",
361
    new_obj(ctx, writer), w, n == 1 ? "Gray" : "RGB", fill, writer->options.compress ? "/Filter/FlateDecode" : "");
362
  fz_write_printf(ctx, out, "/Length %zd/Type/XObject/BitsPerComponent 8>>\nstream\n", len);
363
  fz_write_data(ctx, out, data, len);
364
  fz_write_string(ctx, out, "\nendstream\nendobj\n");
365
}
366
367
static void
368
pdfocr_write_band(fz_context *ctx, fz_band_writer *writer_, int stride, int band_start, int band_height, const unsigned char *sp)
369
{
370
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
371
  fz_output *out = writer->super.out;
372
  int w = writer->super.w;
373
  int h = writer->super.h;
374
  int n = writer->super.n;
375
  int sh = writer->options.strip_height;
376
  int line;
377
  unsigned char *d = writer->ocrbitmap->samples;
378
379
  if (!out)
380
    return;
381
382
  if (sh == 0)
383
    sh = h;
384
385
  for (line = 0; line < band_height; line++)
386
  {
387
    int dstline = (band_start+line) % sh;
388
    memcpy(writer->stripbuf + (size_t)w*n*dstline,
389
         sp + (size_t)line * w * n,
390
         (size_t)w * n);
391
    if (dstline+1 == sh)
392
      flush_strip(ctx, writer, dstline+1);
393
  }
394
395
  if (band_start + band_height == h && h % sh != 0)
396
    flush_strip(ctx, writer, h % sh);
397
398
  /* Copy strip to ocrbitmap, converting if required. */
399
  d += band_start*w;
400
  if (n == 1)
401
  {
402
    int y;
403
    for (y = band_height; y > 0; y--)
404
    {
405
      memcpy(d, sp, w);
406
      if (writer->ocrbitmap->w - w)
407
        memset(d + w, 0, writer->ocrbitmap->w - w);
408
      d += writer->ocrbitmap->w;
409
    }
410
  }
411
  else
412
  {
413
    int x, y;
414
    for (y = band_height; y > 0; y--)
415
    {
416
      for (x = w; x > 0; x--)
417
      {
418
        *d++ = (sp[0] + 2*sp[1] + sp[2] + 2)>>2;
419
        sp += 3;
420
      }
421
      for (x = writer->ocrbitmap->w - w; x > 0; x--)
422
        *d++ = 0;
423
    }
424
  }
425
}
426
427
/* Bit flags recording which writing directions have been observed
 * between the characters of a word (and, OR-ed together, of a
 * line). */
enum
{
  WORD_CONTAINS_L2R = 1 << 0,  /* left to right */
  WORD_CONTAINS_R2L = 1 << 1,  /* right to left */
  WORD_CONTAINS_T2B = 1 << 2,  /* top to bottom */
  WORD_CONTAINS_B2T = 1 << 3   /* bottom to top */
};
434
435
typedef struct word_t word_t;

/* One recognised word, held in a singly linked list that makes up
 * the current line. Words are allocated with extra space after the
 * struct so that chars[] really holds 'len' entries. */
struct word_t
{
  word_t *next;   /* following word on the line, or NULL */
  float bbox[4];  /* x0, y0, x1, y1 */
  int dirn;       /* WORD_CONTAINS_* flags */
  int len;        /* number of entries in chars */
  int chars[1];   /* character codes; over-allocated to len entries */
};
443
444
typedef struct
445
{
446
  fz_buffer *buf;
447
  pdfocr_band_writer *writer;
448
449
  /* We collate the current word into the following fields: */
450
  int word_max;
451
  int word_len;
452
  int *word_chars;
453
  float word_bbox[4];
454
  int word_dirn;
455
  int word_prev_char_bbox[4];
456
457
  /* When we finish a word, we try to add it to the line. If the
458
   * word fits onto the end of the existing line, great. If not,
459
   * we flush the entire line, and start a new one just with the
460
   * new word. This enables us to output a whole line at once,
461
   * which is beneficial to avoid jittering the font sizes
462
   * up/down, which looks bad when we try to select text in the
463
   * produced PDF. */
464
  word_t *line;
465
  word_t **line_tail;
466
  float line_bbox[4];
467
  int line_dirn;
468
469
  float cur_size;
470
  float cur_scale;
471
  float tx, ty;
472
} char_callback_data_t;
473
474
static void
475
flush_words(fz_context *ctx, char_callback_data_t *cb)
476
{
477
  float size;
478
479
  if (cb->line == NULL)
480
    return;
481
482
  if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0)
483
  {
484
    /* Vertical line */
485
  }
486
  else
487
  {
488
    /* Horizontal line */
489
    size = cb->line_bbox[3] - cb->line_bbox[1];
490
491
    if (size != 0 && size != cb->cur_size)
492
    {
493
      fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
494
      cb->cur_size = size;
495
    }
496
    /* Guard against division by 0. This makes no difference to the
497
     * actual calculation as if size is 0, word->bbox[2] == word->bbox[0]
498
     * too. */
499
    if (size == 0)
500
      size = 1;
501
  }
502
503
  while (cb->line)
504
  {
505
    word_t *word = cb->line;
506
    float x, y;
507
    int i, len = word->len;
508
    float scale;
509
510
    if ((cb->line_dirn & (WORD_CONTAINS_T2B | WORD_CONTAINS_B2T)) != 0)
511
    {
512
      /* Contains vertical text. */
513
      size = (word->bbox[3] - word->bbox[1]) / len;
514
      if (size == 0)
515
        size = 1;
516
      if (size != cb->cur_size)
517
      {
518
        fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
519
        cb->cur_size = size;
520
      }
521
522
      /* Set the scale so that our glyphs fill the line bbox. */
523
      scale = (cb->line_bbox[2] - cb->line_bbox[0]) / size * 200;
524
      if (scale != 0)
525
      {
526
        float letter_height = (word->bbox[3] - word->bbox[1]) / len;
527
528
        if (scale != cb->cur_scale)
529
        {
530
          fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
531
          cb->cur_scale = scale;
532
        }
533
534
        for (i = 0; i < len; i++)
535
        {
536
          x = word->bbox[0];
537
          y = word->bbox[1] + letter_height * i;
538
          fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
539
          cb->tx = x;
540
          cb->ty = y;
541
542
          fz_append_printf(ctx, cb->buf, "<%04x>Tj\n", word->chars[i]);
543
        }
544
      }
545
    }
546
    else
547
    {
548
      scale = (word->bbox[2] - word->bbox[0]) / size / len * 200;
549
      if (scale != 0)
550
      {
551
        if (scale != cb->cur_scale)
552
        {
553
          fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
554
          cb->cur_scale = scale;
555
        }
556
557
        if ((word->dirn & (WORD_CONTAINS_R2L | WORD_CONTAINS_L2R)) == WORD_CONTAINS_R2L)
558
        {
559
          /* Purely R2L text */
560
          x = word->bbox[0];
561
          y = cb->line_bbox[1];
562
          fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
563
          cb->tx = x;
564
          cb->ty = y;
565
566
          /* Tesseract has sent us R2L text in R2L order (i.e. in Logical order).
567
           * We want to output it in that same logical order, but PDF operators
568
           * all move the point as if outputting L2R. We can either reverse the
569
           * order of chars (bad, because of cut/paste) or we can perform
570
           * gymnastics with the position. We opt for the latter. */
571
          fz_append_printf(ctx, cb->buf, "[");
572
          for (i = 0; i < len; i++)
573
          {
574
            if (i == 0)
575
            {
576
              if (len > 1)
577
                fz_append_printf(ctx, cb->buf, "%d", -500*(len-1));
578
            }
579
            else
580
              fz_append_printf(ctx, cb->buf, "%d", 1000);
581
            fz_append_printf(ctx, cb->buf, "<%04x>", word->chars[i]);
582
          }
583
          fz_append_printf(ctx, cb->buf, "]TJ\n");
584
        }
585
        else
586
        {
587
          /* L2R (or mixed) text */
588
          x = word->bbox[0];
589
          y = cb->line_bbox[1];
590
          fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
591
          cb->tx = x;
592
          cb->ty = y;
593
594
          fz_append_printf(ctx, cb->buf, "<");
595
          for (i = 0; i < len; i++)
596
            fz_append_printf(ctx, cb->buf, "%04x", word->chars[i]);
597
          fz_append_printf(ctx, cb->buf, ">Tj\n");
598
        }
599
      }
600
    }
601
602
    cb->line = word->next;
603
    fz_free(ctx, word);
604
  }
605
606
  cb->line_tail = &cb->line;
607
  cb->line = NULL;
608
  cb->line_dirn = 0;
609
}
610
611
static void
612
queue_word(fz_context *ctx, char_callback_data_t *cb)
613
{
614
  word_t *word;
615
  int line_is_v, line_is_h, word_is_v, word_is_h;
616
617
  if (cb->word_len == 0)
618
    return;
619
620
  word = fz_malloc(ctx, sizeof(*word) + (cb->word_len-1)*sizeof(int));
621
  word->next = NULL;
622
  word->len = cb->word_len;
623
  memcpy(word->bbox, cb->word_bbox, 4*sizeof(float));
624
  memcpy(word->chars, cb->word_chars, cb->word_len * sizeof(int));
625
  cb->word_len = 0;
626
627
  line_is_v = !!(cb->line_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B));
628
  word_is_v = !!(cb->word_dirn & (WORD_CONTAINS_B2T | WORD_CONTAINS_T2B));
629
  line_is_h = !!(cb->line_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L));
630
  word_is_h = !!(cb->word_dirn & (WORD_CONTAINS_L2R | WORD_CONTAINS_R2L));
631
632
  word->dirn = cb->word_dirn;
633
  cb->word_dirn = 0;
634
635
  /* Can we put the new word onto the end of the existing line? */
636
  if (cb->line != NULL &&
637
    !line_is_v && !word_is_v &&
638
    word->bbox[1] <= cb->line_bbox[3] &&
639
    word->bbox[3] >= cb->line_bbox[1] &&
640
    (word->bbox[0] >= cb->line_bbox[2] || word->bbox[2] <= cb->line_bbox[0]))
641
  {
642
    /* Can append (horizontal motion). */
643
    if (word->bbox[0] < cb->line_bbox[0])
644
      cb->line_bbox[0] = word->bbox[0];
645
    if (word->bbox[1] < cb->line_bbox[1])
646
      cb->line_bbox[1] = word->bbox[1];
647
    if (word->bbox[2] > cb->line_bbox[2])
648
      cb->line_bbox[2] = word->bbox[2];
649
    if (word->bbox[3] > cb->line_bbox[3])
650
      cb->line_bbox[3] = word->bbox[3];
651
  }
652
  else if (cb->line != NULL &&
653
    !line_is_h && !word_is_h &&
654
    word->bbox[0] <= cb->line_bbox[2] &&
655
    word->bbox[2] >= cb->line_bbox[0] &&
656
    (word->bbox[1] >= cb->line_bbox[3] || word->bbox[3] <= cb->line_bbox[1]))
657
  {
658
    /* Can append (vertical motion). */
659
    if (!word_is_v)
660
      word->dirn |= WORD_CONTAINS_T2B;
661
    if (word->bbox[0] < cb->line_bbox[0])
662
      cb->line_bbox[0] = word->bbox[0];
663
    if (word->bbox[1] < cb->line_bbox[1])
664
      cb->line_bbox[1] = word->bbox[1];
665
    if (word->bbox[2] > cb->line_bbox[2])
666
      cb->line_bbox[2] = word->bbox[2];
667
    if (word->bbox[3] > cb->line_bbox[3])
668
      cb->line_bbox[3] = word->bbox[3];
669
  }
670
  else
671
  {
672
    fz_try(ctx)
673
      flush_words(ctx, cb);
674
    fz_catch(ctx)
675
    {
676
      fz_free(ctx, word);
677
      fz_rethrow(ctx);
678
    }
679
    memcpy(cb->line_bbox, word->bbox, 4*sizeof(float));
680
  }
681
682
  *cb->line_tail = word;
683
  cb->line_tail = &word->next;
684
  cb->line_dirn |= word->dirn;
685
}
686
687
static void
688
char_callback(fz_context *ctx, void *arg, int unicode,
689
    const char *font_name,
690
    const int *line_bbox, const int *word_bbox,
691
    const int *char_bbox, int pointsize)
692
{
693
  char_callback_data_t *cb = (char_callback_data_t *)arg;
694
  pdfocr_band_writer *writer = cb->writer;
695
  float bbox[4];
696
697
  bbox[0] = word_bbox[0] * 72.0f / cb->writer->ocrbitmap->xres;
698
  bbox[3] = (writer->ocrbitmap->h - 1 - word_bbox[1]) * 72.0f / cb->writer->ocrbitmap->yres;
699
  bbox[2] = word_bbox[2] * 72.0f / cb->writer->ocrbitmap->yres;
700
  bbox[1] = (writer->ocrbitmap->h - 1 - word_bbox[3]) * 72.0f / cb->writer->ocrbitmap->yres;
701
702
  if (bbox[0] != cb->word_bbox[0] ||
703
    bbox[1] != cb->word_bbox[1] ||
704
    bbox[2] != cb->word_bbox[2] ||
705
    bbox[3] != cb->word_bbox[3])
706
  {
707
    queue_word(ctx, cb);
708
    memcpy(cb->word_bbox, bbox, 4 * sizeof(float));
709
  }
710
711
  if (cb->word_len == 0)
712
  {
713
    cb->word_dirn = 0;
714
    memcpy(cb->word_prev_char_bbox, char_bbox, 4 * sizeof(int));
715
  }
716
  else
717
  {
718
    int ox = cb->word_prev_char_bbox[0] + cb->word_prev_char_bbox[2];
719
    int oy = cb->word_prev_char_bbox[1] + cb->word_prev_char_bbox[3];
720
    int x = char_bbox[0] + char_bbox[2] - ox;
721
    int y = char_bbox[1] + char_bbox[3] - oy;
722
    int ax = x < 0 ? -x : x;
723
    int ay = y < 0 ? -y : y;
724
    if (ax > ay)
725
    {
726
      if (x > 0)
727
        cb->word_dirn |= WORD_CONTAINS_L2R;
728
      else if (x < 0)
729
        cb->word_dirn |= WORD_CONTAINS_R2L;
730
    }
731
    else if (ay < ax)
732
    {
733
      if (y > 0)
734
        cb->word_dirn |= WORD_CONTAINS_T2B;
735
      else if (y < 0)
736
        cb->word_dirn |= WORD_CONTAINS_B2T;
737
    }
738
  }
739
740
  if (cb->word_max == cb->word_len)
741
  {
742
    int newmax = cb->word_max * 2;
743
    if (newmax == 0)
744
      newmax = 16;
745
    cb->word_chars = fz_realloc_array(ctx, cb->word_chars, newmax, int);
746
    cb->word_max = newmax;
747
  }
748
749
  cb->word_chars[cb->word_len++] = unicode;
750
}
751
752
static int
753
pdfocr_progress(fz_context *ctx, void *arg, int prog)
754
{
755
  char_callback_data_t *cb = (char_callback_data_t *)arg;
756
  pdfocr_band_writer *writer = cb->writer;
757
758
  if (writer->progress == NULL)
759
    return 0;
760
761
  return writer->progress(ctx, writer->progress_arg, writer->pages - 1, prog);
762
}
763
764
static void
765
pdfocr_write_trailer(fz_context *ctx, fz_band_writer *writer_)
766
{
767
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
768
  fz_output *out = writer->super.out;
769
  int w = writer->super.w;
770
  int h = writer->super.h;
771
  int xres = writer->super.xres;
772
  int yres = writer->super.yres;
773
  int sh = writer->options.strip_height;
774
  int strips;
775
  int i;
776
  size_t len;
777
  unsigned char *data;
778
  fz_buffer *buf = NULL;
779
  char_callback_data_t cb = { NULL };
780
781
  if (sh == 0)
782
    sh = h;
783
  strips = (h + sh-1)/sh;
784
785
  /* Send the Page contents */
786
  /* We need the length to this, so write to a buffer first */
787
  fz_var(buf);
788
  fz_var(cb);
789
  fz_try(ctx)
790
  {
791
    cb.writer = writer;
792
    cb.buf = buf = fz_new_buffer(ctx, 0);
793
    cb.line_tail = &cb.line;
794
    cb.word_dirn = 0;
795
    cb.line_dirn = 0;
796
    fz_append_printf(ctx, buf, "q\n%g 0 0 %g 0 0 cm\n", 72.0f/xres, 72.0f/yres);
797
    for (i = 0; i < strips; i++)
798
    {
799
      int at = h - (i+1)*sh;
800
      int this_sh = sh;
801
      if (at < 0)
802
      {
803
        this_sh += at;
804
        at = 0;
805
      }
806
      fz_append_printf(ctx, buf, "/P <</MCID 0>> BDC\nq\n%d 0 0 %d 0 %d cm\n/I%d Do\nQ\n",
807
        w, this_sh, at, i);
808
    }
809
810
    fz_append_printf(ctx, buf, "Q\nBT\n3 Tr\n");
811
812
    ocr_recognise(ctx, writer->tessapi, writer->ocrbitmap, char_callback, pdfocr_progress, &cb);
813
    queue_word(ctx, &cb);
814
    flush_words(ctx, &cb);
815
    fz_append_printf(ctx, buf, "ET\n");
816
817
    len = fz_buffer_storage(ctx, buf, &data);
818
    fz_write_printf(ctx, out, "%d 0 obj\n<</Length %zd>>\nstream\n", new_obj(ctx, writer), len);
819
    fz_write_data(ctx, out, data, len);
820
    fz_drop_buffer(ctx, buf);
821
    buf = NULL;
822
    fz_write_string(ctx, out, "\nendstream\nendobj\n");
823
  }
824
  fz_always(ctx)
825
  {
826
    fz_free(ctx, cb.word_chars);
827
  }
828
  fz_catch(ctx)
829
  {
830
    fz_drop_buffer(ctx, buf);
831
    fz_rethrow(ctx);
832
  }
833
}
834
835
static void
836
pdfocr_close_band_writer(fz_context *ctx, fz_band_writer *writer_)
837
{
838
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
839
  fz_output *out = writer->super.out;
840
  int i;
841
842
  /* We actually do the trailer writing in the close */
843
  if (writer->xref_max > 2)
844
  {
845
    int64_t t_pos;
846
847
    /* Catalog */
848
    writer->xref[1] = fz_tell_output(ctx, out);
849
    fz_write_printf(ctx, out, "1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n");
850
851
    /* Page table */
852
    writer->xref[2] = fz_tell_output(ctx, out);
853
    fz_write_printf(ctx, out, "2 0 obj\n<</Count %d/Kids[", writer->pages);
854
855
    for (i = 0; i < writer->pages; i++)
856
    {
857
      if (i > 0)
858
        fz_write_byte(ctx, out, ' ');
859
      fz_write_printf(ctx, out, "%d 0 R", writer->page_obj[i]);
860
    }
861
    fz_write_string(ctx, out, "]/Type/Pages>>\nendobj\n");
862
863
    /* Xref */
864
    t_pos = fz_tell_output(ctx, out);
865
    fz_write_printf(ctx, out, "xref\n0 %d\n0000000000 65535 f \n", writer->obj_num);
866
    for (i = 1; i < writer->obj_num; i++)
867
      fz_write_printf(ctx, out, "%010ld 00000 n \n", writer->xref[i]);
868
    fz_write_printf(ctx, out, "trailer\n<</Size %d/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n", writer->obj_num, t_pos);
869
  }
870
}
871
872
static void
873
pdfocr_drop_band_writer(fz_context *ctx, fz_band_writer *writer_)
874
{
875
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
876
  fz_free(ctx, writer->stripbuf);
877
  fz_free(ctx, writer->compbuf);
878
  fz_free(ctx, writer->page_obj);
879
  fz_free(ctx, writer->xref);
880
  fz_drop_pixmap(ctx, writer->ocrbitmap);
881
  ocr_fin(ctx, writer->tessapi);
882
}
883
#endif
884
885
/*
  Create a band writer that produces PDFOCR output on out.

  options: Settings to copy into the writer; NULL selects all-zero
  defaults.

  Throws in builds with OCR_DISABLED. If the OCR engine cannot be
  initialised the half-built writer is dropped and a generic
  "OCR initialisation failed" error is thrown.
*/
fz_band_writer *fz_new_pdfocr_band_writer(fz_context *ctx, fz_output *out, const fz_pdfocr_options *options)
{
#ifdef OCR_DISABLED
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
#else
  pdfocr_band_writer *bw = fz_new_band_writer(ctx, pdfocr_band_writer, out);

  if (options != NULL)
    bw->options = *options;
  else
    memset(&bw->options, 0, sizeof(bw->options));

  /* Objects 1..8 have fixed roles, so dynamically numbered objects
   * start at 9:
   *  1 reserved for catalog
   *  2 for pages tree
   *  3 font
   *  4 cidfont
   *  5 cid to gid map
   *  6 tounicode
   *  7 font descriptor
   *  8 font file
   */
  bw->obj_num = 9;

  /* Hook up the callbacks before anything can fail, so that dropping
   * the writer on error goes through pdfocr_drop_band_writer. */
  bw->super.header = pdfocr_write_header;
  bw->super.band = pdfocr_write_band;
  bw->super.trailer = pdfocr_write_trailer;
  bw->super.close = pdfocr_close_band_writer;
  bw->super.drop = pdfocr_drop_band_writer;

  fz_try(ctx)
  {
    bw->tessapi = ocr_init(ctx, bw->options.language, bw->options.datadir);
  }
  fz_catch(ctx)
  {
    fz_drop_band_writer(ctx, &bw->super);
    fz_throw(ctx, FZ_ERROR_GENERIC, "OCR initialisation failed");
  }

  return &bw->super;
#endif
}
928
929
void
930
fz_pdfocr_band_writer_set_progress(fz_context *ctx, fz_band_writer *writer_, fz_pdfocr_progress_fn *progress, void *progress_arg)
931
0
{
932
0
#ifdef OCR_DISABLED
933
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
934
#else
935
  pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
936
  if (writer == NULL)
937
    return;
938
  if (writer->super.header != pdfocr_write_header)
939
    fz_throw(ctx, FZ_ERROR_GENERIC, "Not a pdfocr band writer!");
940
941
  writer->progress = progress;
942
  writer->progress_arg = progress_arg;
943
#endif
944
0
}
945
946
void
947
fz_save_pixmap_as_pdfocr(fz_context *ctx, fz_pixmap *pixmap, char *filename, int append, const fz_pdfocr_options *pdfocr)
948
0
{
949
0
#ifdef OCR_DISABLED
950
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
951
#else
952
  fz_output *out = fz_new_output_with_path(ctx, filename, append);
953
  fz_try(ctx)
954
  {
955
    fz_write_pixmap_as_pdfocr(ctx, out, pixmap, pdfocr);
956
    fz_close_output(ctx, out);
957
  }
958
  fz_always(ctx)
959
    fz_drop_output(ctx, out);
960
  fz_catch(ctx)
961
    fz_rethrow(ctx);
962
#endif
963
0
}
964
965
/* High-level document writer interface */
966
967
#ifndef OCR_DISABLED
968
typedef struct
969
{
970
  fz_document_writer super;
971
  fz_draw_options draw;
972
  fz_pdfocr_options pdfocr;
973
  fz_pixmap *pixmap;
974
  fz_band_writer *bander;
975
  fz_output *out;
976
  int pagenum;
977
} fz_pdfocr_writer;
978
979
static fz_device *
980
pdfocr_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
981
{
982
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
983
  return fz_new_draw_device_with_options(ctx, &wri->draw, mediabox, &wri->pixmap);
984
}
985
986
static void
987
pdfocr_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
988
{
989
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
990
  fz_pixmap *pix = wri->pixmap;
991
992
  fz_try(ctx)
993
  {
994
    fz_close_device(ctx, dev);
995
    fz_write_header(ctx, wri->bander, pix->w, pix->h, pix->n, pix->alpha, pix->xres, pix->yres, wri->pagenum++, pix->colorspace, pix->seps);
996
    fz_write_band(ctx, wri->bander, pix->stride, pix->h, pix->samples);
997
  }
998
  fz_always(ctx)
999
  {
1000
    fz_drop_device(ctx, dev);
1001
    fz_drop_pixmap(ctx, pix);
1002
    wri->pixmap = NULL;
1003
  }
1004
  fz_catch(ctx)
1005
    fz_rethrow(ctx);
1006
}
1007
1008
static void
1009
pdfocr_close_writer(fz_context *ctx, fz_document_writer *wri_)
1010
{
1011
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
1012
1013
  fz_close_band_writer(ctx, wri->bander);
1014
  fz_close_output(ctx, wri->out);
1015
}
1016
1017
static void
1018
pdfocr_drop_writer(fz_context *ctx, fz_document_writer *wri_)
1019
{
1020
  fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
1021
1022
  fz_drop_pixmap(ctx, wri->pixmap);
1023
  fz_drop_band_writer(ctx, wri->bander);
1024
  fz_drop_output(ctx, wri->out);
1025
}
1026
#endif
1027
1028
fz_document_writer *
1029
fz_new_pdfocr_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
1030
0
{
1031
0
#ifdef OCR_DISABLED
1032
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
1033
#else
1034
  fz_pdfocr_writer *wri = NULL;
1035
1036
  fz_var(wri);
1037
1038
  fz_try(ctx)
1039
  {
1040
    wri = fz_new_derived_document_writer(ctx, fz_pdfocr_writer, pdfocr_begin_page, pdfocr_end_page, pdfocr_close_writer, pdfocr_drop_writer);
1041
    fz_parse_draw_options(ctx, &wri->draw, options);
1042
    fz_parse_pdfocr_options(ctx, &wri->pdfocr, options);
1043
    wri->out = out;
1044
    wri->bander = fz_new_pdfocr_band_writer(ctx, wri->out, &wri->pdfocr);
1045
  }
1046
  fz_catch(ctx)
1047
  {
1048
    fz_drop_output(ctx, out);
1049
    fz_free(ctx, wri);
1050
    fz_rethrow(ctx);
1051
  }
1052
1053
  return (fz_document_writer*)wri;
1054
#endif
1055
0
}
1056
1057
fz_document_writer *
1058
fz_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options)
1059
0
{
1060
0
#ifdef OCR_DISABLED
1061
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
1062
#else
1063
  fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.pdfocr", 0);
1064
  return fz_new_pdfocr_writer_with_output(ctx, out, options);
1065
#endif
1066
0
}
1067
1068
void
1069
fz_pdfocr_writer_set_progress(fz_context *ctx, fz_document_writer *writer, fz_pdfocr_progress_fn *progress, void *progress_arg)
1070
0
{
1071
0
#ifdef OCR_DISABLED
1072
0
  fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
1073
#else
1074
  fz_pdfocr_writer *wri = (fz_pdfocr_writer *)writer;
1075
  if (!writer)
1076
    return;
1077
  if (writer->begin_page != pdfocr_begin_page)
1078
    fz_throw(ctx, FZ_ERROR_GENERIC, "Not a pdfocr writer!");
1079
  fz_pdfocr_band_writer_set_progress(ctx, wri->bander, progress, progress_arg);
1080
#endif
1081
0
}