Coverage Report

Created: 2025-07-23 06:37

/src/mupdf/source/fitz/stext-output.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#define SUBSCRIPT_OFFSET 0.2f
26
#define SUPERSCRIPT_OFFSET -0.2f
27
28
#include <ft2build.h>
29
#include FT_FREETYPE_H
30
31
// Text black color when converted from DeviceCMYK to RGB
32
0
#define CMYK_BLACK 0x221f1f
33
34
static void
35
scale_run(fz_context *ctx, fz_stext_block *block, float scale)
36
0
{
37
0
  fz_matrix m = fz_scale(scale, scale);
38
0
  fz_stext_line *line;
39
0
  fz_stext_char *ch;
40
41
0
  while (block)
42
0
  {
43
0
    block->bbox = fz_transform_rect(block->bbox, m);
44
0
    switch (block->type)
45
0
    {
46
0
    case FZ_STEXT_BLOCK_TEXT:
47
0
      for (line = block->u.t.first_line; line; line = line->next)
48
0
      {
49
0
        line->bbox = fz_transform_rect(block->bbox, m);
50
0
        for (ch = line->first_char; ch; ch = ch->next)
51
0
        {
52
0
          ch->origin = fz_transform_point(ch->origin, m);
53
0
          ch->quad = fz_transform_quad(ch->quad, m);
54
0
          ch->size = ch->size * scale;
55
0
        }
56
0
      }
57
0
      break;
58
59
0
    case FZ_STEXT_BLOCK_IMAGE:
60
0
      block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale);
61
0
      break;
62
63
0
    case FZ_STEXT_BLOCK_STRUCT:
64
0
      if (block->u.s.down)
65
0
        scale_run(ctx, block->u.s.down->first_block, scale);
66
0
      break;
67
0
    }
68
0
    block = block->next;
69
0
  }
70
0
}
71
72
/* Scale every block on 'page' uniformly by 'scale'. */
static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale)
{
  fz_stext_block *head = page->first_block;

  scale_run(ctx, head, scale);
}
76
77
/* HTML output (visual formatting with preserved layout) */
78
79
static int
80
detect_super_script(fz_stext_line *line, fz_stext_char *ch)
81
0
{
82
0
  if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
83
0
    return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
84
0
  return 0;
85
0
}
86
87
static const char *
88
font_full_name(fz_context *ctx, fz_font *font)
89
0
{
90
0
  const char *name = fz_font_name(ctx, font);
91
0
  const char *s = strchr(name, '+');
92
0
  return s ? s + 1 : name;
93
0
}
94
95
/*
  Map a font name onto a common family name by substring match.

  Names containing "Times" become "Times New Roman"; names containing
  "Arial" or "Helvetica" become "Arial" (or "Arial Narrow" when they also
  contain "Narrow" or "Condensed"); names containing "Courier" become
  "Courier". Anything else is returned unchanged (same pointer).
*/
static const char *
html_clean_font_name(const char *fontname)
{
  int is_arial_like;

  if (strstr(fontname, "Times") != NULL)
    return "Times New Roman";

  is_arial_like = strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL;
  if (!is_arial_like)
  {
    if (strstr(fontname, "Courier") != NULL)
      return "Courier";
    return fontname;
  }

  if (strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL)
    return "Arial Narrow";
  return "Arial";
}
110
111
static void
112
font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
113
0
{
114
0
  const char *name = html_clean_font_name(font_full_name(ctx, font));
115
0
  char *s;
116
0
  fz_strlcpy(buf, name, size);
117
0
  s = strrchr(buf, '-');
118
0
  if (s)
119
0
    *s = 0;
120
0
  if (is_mono)
121
0
    fz_strlcat(buf, ",monospace", size);
122
0
  else
123
0
    fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
124
0
}
125
126
static void
127
fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
128
0
{
129
0
  char family[80];
130
131
0
  int is_bold = fz_font_is_bold(ctx, font);
132
0
  int is_italic = fz_font_is_italic(ctx, font);
133
0
  int is_serif = fz_font_is_serif(ctx, font);
134
0
  int is_mono = fz_font_is_monospaced(ctx, font);
135
136
0
  font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
137
138
0
  if (sup) fz_write_string(ctx, out, "<sup>");
139
0
  if (is_mono) fz_write_string(ctx, out, "<tt>");
140
0
  if (is_bold) fz_write_string(ctx, out, "<b>");
141
0
  if (is_italic) fz_write_string(ctx, out, "<i>");
142
0
  fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size);
143
0
  if (color != 0 && color != CMYK_BLACK)
144
0
    fz_write_printf(ctx, out, ";color:#%06x", color & 0xffffff);
145
0
  fz_write_printf(ctx, out, "\">");
146
0
}
147
148
static void
149
fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
150
0
{
151
0
  int is_mono = fz_font_is_monospaced(ctx, font);
152
0
  int is_bold = fz_font_is_bold(ctx,font);
153
0
  int is_italic = fz_font_is_italic(ctx, font);
154
155
0
  fz_write_string(ctx, out, "</span>");
156
0
  if (is_italic) fz_write_string(ctx, out, "</i>");
157
0
  if (is_bold) fz_write_string(ctx, out, "</b>");
158
0
  if (is_mono) fz_write_string(ctx, out, "</tt>");
159
0
  if (sup) fz_write_string(ctx, out, "</sup>");
160
0
}
161
162
static void
163
fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
164
0
{
165
0
  fz_matrix ctm = block->u.i.transform;
166
167
0
#define USE_CSS_MATRIX_TRANSFORMS
168
0
#ifdef USE_CSS_MATRIX_TRANSFORMS
169
  /* Matrix maths notes.
170
   * When we get here ctm maps the unit square to the position in device
171
   * space occupied by the image.
172
   *
173
   * That is to say that mapping the 4 corners of the unit square through
174
   * the transform, give us the 4 target corners. We extend the corners
175
   * by adding an extra '1' into them to allow transforms to work. Thus
176
   * (x,y) maps through ctm = (a b c d e f) as:
177
   *
178
   * (x y 1) (a b 0) = (X Y 1)
179
   *         (c d 0)
180
   *         (e f 1)
181
   *
182
   * To simplify reading of matrix maths, we use the trick where we
183
   * 'drop' the first matrix down the page. Thus the corners c0=(0,0),
184
   * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively:
185
   *
186
   *         (    a     b 0)
187
   *         (    c     d 0)
188
   *         (    e     f 1)
189
   * (0 0 1) (    e     f 1)
190
   * (0 1 1) (  c+e   d+f 1)
191
   * (1 1 1) (a+c+e b+d+f 1)
192
   * (1 0 1) (  a+e   b+f 1)
193
   *
194
   * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f)
195
   *
196
   * Unfortunately, the CSS matrix transform, does not map the unit square.
197
   * Rather it does something moderately mad. As far as I can work out, the
198
   * top left corner of a (0,0) -> (w, h) box is transformed using the .e
199
   * and .f entries of the matrix. Then the image from within that square
200
   * is transformed using the centre of that square as the origin.
201
   *
202
   * So, an image placed at (0,0) in destination space with 1:1 transform
203
   * will result in an image a (0,0) as you'd expect. But an image at (0,0)
204
   * with a scale of 2, will result in 25% of the image off the left of the
205
   * screen, and 25% off the top.
206
   *
207
   * Accordingly, we have to adjust the ctm in several steps.
208
   */
209
  /* Move to moving the centre of the image. */
210
0
  ctm.e += (ctm.a+ctm.c)/2;
211
0
  ctm.f += (ctm.b+ctm.d)/2;
212
  /* Move from transforming the unit square to w/h */
213
0
  ctm.a /= block->u.i.image->w;
214
0
  ctm.b /= block->u.i.image->w;
215
0
  ctm.c /= block->u.i.image->h;
216
0
  ctm.d /= block->u.i.image->h;
217
  /* Move from points to pixels */
218
0
  ctm.a *= 96.0f/72;
219
0
  ctm.b *= 96.0f/72;
220
0
  ctm.c *= 96.0f/72;
221
0
  ctm.d *= 96.0f/72;
222
0
  ctm.e *= 96.0f/72;
223
0
  ctm.f *= 96.0f/72;
224
  /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */
225
0
  ctm.e -= block->u.i.image->w/2;
226
0
  ctm.f -= block->u.i.image->h/2;
227
228
0
  fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"",
229
0
    ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f);
230
#else
231
  /* Alternative version of the code that uses scaleX/Y and rotate
232
   * instead, but only copes with axis aligned cases. */
233
  int t;
234
235
  int x = block->bbox.x0;
236
  int y = block->bbox.y0;
237
  int w = block->bbox.x1 - block->bbox.x0;
238
  int h = block->bbox.y1 - block->bbox.y0;
239
240
  const char *flip = "";
241
242
  if (ctm.b == 0 && ctm.c == 0)
243
  {
244
    if (ctm.a < 0 && ctm.d < 0)
245
      flip = "transform: scaleX(-1) scaleY(-1);";
246
    else if (ctm.a < 0)
247
    {
248
      flip = "transform: scaleX(-1);";
249
    }
250
    else if (ctm.d < 0)
251
    {
252
      flip = "transform: scaleY(-1);";
253
    }
254
  } else if (ctm.a == 0 && ctm.d == 0) {
255
    if (ctm.b < 0 && ctm.c < 0)
256
    {
257
      flip = "transform: scaleY(-1) rotate(90deg);";
258
      x += (w-h)/2;
259
      y -= (w-h)/2;
260
      t = w; w = h; h = t;
261
    }
262
    else if (ctm.b < 0)
263
    {
264
      flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);";
265
      x += (w-h)/2;
266
      y -= (w-h)/2;
267
      t = w; w = h; h = t;
268
    }
269
    else if (ctm.c < 0)
270
    {
271
      flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);";
272
      x += (w-h)/2;
273
      y -= (w-h)/2;
274
      t = w; w = h; h = t;
275
    }
276
    else
277
    {
278
      flip = "transform: scaleY(-1) rotate(270deg);";
279
      x += (w-h)/2;
280
      y -= (w-h)/2;
281
      t = w; w = h; h = t;
282
    }
283
  }
284
285
  fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h);
286
#endif
287
0
  fz_write_image_as_data_uri(ctx, out, block->u.i.image);
288
0
  fz_write_string(ctx, out, "\">\n");
289
0
}
290
291
void
292
fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
293
0
{
294
0
  fz_stext_line *line;
295
0
  fz_stext_char *ch;
296
0
  float x, y, h;
297
298
0
  fz_font *font = NULL;
299
0
  float size = 0;
300
0
  int sup = 0;
301
0
  uint32_t color = 0;
302
303
0
  for (line = block->u.t.first_line; line; line = line->next)
304
0
  {
305
0
    x = line->bbox.x0;
306
0
    y = line->bbox.y0;
307
0
    h = line->bbox.y1 - line->bbox.y0;
308
309
0
    if (line->first_char)
310
0
    {
311
0
      h = line->first_char->size;
312
0
      y = line->first_char->origin.y - h * 0.8f;
313
0
    }
314
315
0
    fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h);
316
0
    font = NULL;
317
318
0
    for (ch = line->first_char; ch; ch = ch->next)
319
0
    {
320
0
      int ch_sup = detect_super_script(line, ch);
321
0
      if (ch->font != font || ch->size != size || ch_sup != sup || ch->argb != color)
322
0
      {
323
0
        if (font)
324
0
          fz_print_style_end_html(ctx, out, font, size, sup, color);
325
0
        font = ch->font;
326
0
        size = ch->size;
327
0
        color = ch->argb;
328
0
        sup = ch_sup;
329
0
        fz_print_style_begin_html(ctx, out, font, size, sup, color);
330
0
      }
331
332
0
      switch (ch->c)
333
0
      {
334
0
      default:
335
0
        if (ch->c >= 32 && ch->c <= 127)
336
0
          fz_write_byte(ctx, out, ch->c);
337
0
        else
338
0
          fz_write_printf(ctx, out, "&#x%x;", ch->c);
339
0
        break;
340
0
      case '<': fz_write_string(ctx, out, "&lt;"); break;
341
0
      case '>': fz_write_string(ctx, out, "&gt;"); break;
342
0
      case '&': fz_write_string(ctx, out, "&amp;"); break;
343
0
      case '"': fz_write_string(ctx, out, "&quot;"); break;
344
0
      case '\'': fz_write_string(ctx, out, "&apos;"); break;
345
0
      }
346
0
    }
347
348
0
    if (font)
349
0
      fz_print_style_end_html(ctx, out, font, size, sup, color);
350
351
0
    fz_write_string(ctx, out, "</p>\n");
352
0
  }
353
0
}
354
355
static const char *
356
html_tag_for_struct(fz_stext_struct *s)
357
0
{
358
0
  const char *raw;
359
360
0
  if (s == NULL)
361
0
    return "DIV";
362
363
0
  raw = s->raw;
364
0
  if (raw == NULL)
365
0
    raw = fz_structure_to_string(s->standard);
366
367
0
  if (!fz_strcasecmp(raw, "blockquote"))
368
0
    return "blockquote";
369
0
  if (!fz_strcasecmp(raw, "title"))
370
0
    return "h1";
371
0
  if (!fz_strcasecmp(raw, "sub"))
372
0
    return "sub";
373
0
  if (!fz_strcasecmp(raw, "p"))
374
0
    return "p";
375
0
  if (!fz_strcasecmp(raw, "h"))
376
0
    return "h1"; /* Pick one! */
377
0
  if (!fz_strcasecmp(raw, "h1"))
378
0
    return "h1";
379
0
  if (!fz_strcasecmp(raw, "h2"))
380
0
    return "h2";
381
0
  if (!fz_strcasecmp(raw, "h3"))
382
0
    return "h3";
383
0
  if (!fz_strcasecmp(raw, "h4"))
384
0
    return "h4";
385
0
  if (!fz_strcasecmp(raw, "h5"))
386
0
    return "h5";
387
0
  if (!fz_strcasecmp(raw, "h6"))
388
0
    return "h6";
389
390
0
  if (!fz_strcasecmp(raw, "list"))
391
0
    return "ul";
392
0
  if (!fz_strcasecmp(raw, "listitem"))
393
0
    return "li";
394
0
  if (!fz_strcasecmp(raw, "table"))
395
0
    return "table";
396
0
  if (!fz_strcasecmp(raw, "tr"))
397
0
    return "tr";
398
0
  if (!fz_strcasecmp(raw, "th"))
399
0
    return "th";
400
0
  if (!fz_strcasecmp(raw, "td"))
401
0
    return "td";
402
0
  if (!fz_strcasecmp(raw, "thead"))
403
0
    return "thead";
404
0
  if (!fz_strcasecmp(raw, "tbody"))
405
0
    return "tbody";
406
0
  if (!fz_strcasecmp(raw, "tfoot"))
407
0
    return "tfoot";
408
409
0
  if (!fz_strcasecmp(raw, "span"))
410
0
    return "span";
411
0
  if (!fz_strcasecmp(raw, "code"))
412
0
    return "code";
413
0
  if (!fz_strcasecmp(raw, "em"))
414
0
    return "em";
415
0
  if (!fz_strcasecmp(raw, "strong"))
416
0
    return "strong";
417
418
0
  return "div";
419
0
}
420
421
static void
422
print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block);
423
424
static void
425
fz_print_stext_struct_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
426
0
{
427
0
  const char *tag;
428
429
0
  if (block->u.s.down == NULL)
430
0
    return;
431
432
0
  tag = html_tag_for_struct(block->u.s.down);
433
434
0
  fz_write_printf(ctx, out, "<%s>\n", tag);
435
436
0
  print_blocks_as_html(ctx, out, block->u.s.down->first_block);
437
438
0
  fz_write_printf(ctx, out, "</%s>\n", tag);
439
0
}
440
441
static void
442
print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
443
0
{
444
0
  for (; block; block = block->next)
445
0
  {
446
0
    if (block->type == FZ_STEXT_BLOCK_IMAGE)
447
0
      fz_print_stext_image_as_html(ctx, out, block);
448
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
449
0
      fz_print_stext_block_as_html(ctx, out, block);
450
0
    else if (block->type == FZ_STEXT_BLOCK_STRUCT)
451
0
      fz_print_stext_struct_as_html(ctx, out, block);
452
0
  }
453
0
}
454
455
void
456
fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
457
0
{
458
0
  float w = page->mediabox.x1 - page->mediabox.x0;
459
0
  float h = page->mediabox.y1 - page->mediabox.y0;
460
461
0
  fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h);
462
463
0
  print_blocks_as_html(ctx, out, page->first_block);
464
465
0
  fz_write_string(ctx, out, "</div>\n");
466
0
}
467
468
void
469
fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
470
0
{
471
0
  fz_write_string(ctx, out, "<!DOCTYPE html>\n");
472
0
  fz_write_string(ctx, out, "<html>\n");
473
0
  fz_write_string(ctx, out, "<head>\n");
474
0
  fz_write_string(ctx, out, "<style>\n");
475
0
  fz_write_string(ctx, out, "body{background-color:slategray}\n");
476
0
  fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n");
477
0
  fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n");
478
0
  fz_write_string(ctx, out, "</style>\n");
479
0
  fz_write_string(ctx, out, "</head>\n");
480
0
  fz_write_string(ctx, out, "<body>\n");
481
0
}
482
483
void
484
fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
485
0
{
486
0
  fz_write_string(ctx, out, "</body>\n");
487
0
  fz_write_string(ctx, out, "</html>\n");
488
0
}
489
490
/* XHTML output (semantic, little layout, suitable for reflow) */
491
492
static void
493
find_table_pos(fz_stext_grid_positions *xs, float x0, float x1, int *ix0, int *ix1)
494
0
{
495
0
  int i;
496
497
0
  *ix0 = -1;
498
0
  *ix1 = -1;
499
500
0
  for (i = 1; i < xs->len; i++)
501
0
    if (x0 < xs->list[i].pos)
502
0
    {
503
0
      *ix0 = i-1;
504
0
      break;
505
0
    }
506
0
  for (; i < xs->len; i++)
507
0
    if (x1 < xs->list[i].pos)
508
0
    {
509
0
      *ix1 = i-1;
510
0
      break;
511
0
    }
512
0
  if (i == xs->len)
513
0
    *ix1 = i-1;
514
0
}
515
516
static void
517
run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out);
518
519
static void
520
fz_print_stext_table_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
521
0
{
522
0
  fz_stext_block *grid, *tr, *td;
523
0
  int w, h;
524
0
  int x, y;
525
0
  uint8_t *cells;
526
0
  int malformed = 0;
527
528
0
  for (grid = block; grid != NULL; grid = grid->next)
529
0
    if (grid->type == FZ_STEXT_BLOCK_GRID)
530
0
      break;
531
0
  if (grid == NULL)
532
0
  {
533
0
    fz_warn(ctx, "Malformed table data");
534
0
    return;
535
0
  }
536
0
  w = grid->u.b.xs->len;
537
0
  h = grid->u.b.ys->len;
538
0
  cells = fz_calloc(ctx, w, h);
539
540
0
  fz_try(ctx)
541
0
  {
542
0
    fz_write_printf(ctx, out, "<table>\n");
543
544
0
    y = 0;
545
0
    for (tr = grid->next; tr != NULL; tr = tr->next)
546
0
    {
547
0
      if (tr->type != FZ_STEXT_BLOCK_STRUCT || tr->u.s.down == NULL || tr->u.s.down->standard != FZ_STRUCTURE_TR)
548
0
      {
549
0
        malformed = 1;
550
0
        continue;
551
0
      }
552
0
      fz_write_printf(ctx, out, "<tr>\n");
553
0
      x = 0;
554
0
      for (td = tr->u.s.down->first_block; td != NULL; td = td->next)
555
0
      {
556
0
        int x0, y0, x1, y1;
557
0
        if (td->type != FZ_STEXT_BLOCK_STRUCT || td->u.s.down == NULL || td->u.s.down->standard != FZ_STRUCTURE_TD)
558
0
        {
559
0
          malformed = 1;
560
0
          continue;
561
0
        }
562
0
        find_table_pos(grid->u.b.xs, td->bbox.x0, td->bbox.x1, &x0, &x1);
563
0
        find_table_pos(grid->u.b.ys, td->bbox.y0, td->bbox.y1, &y0, &y1);
564
0
        if (x0 < 0 || x1 < 0 || x1 >= w)
565
0
        {
566
0
          malformed = 1;
567
0
          x0 = x;
568
0
          x1 = x+1;
569
0
        }
570
0
        if (y0 < 0 || y1 < 0 || y1 >= h)
571
0
        {
572
0
          malformed = 1;
573
0
          y0 = y;
574
0
          y1 = y+1;
575
0
        }
576
0
        if (y < y0)
577
0
        {
578
0
          malformed = 1;
579
0
          continue;
580
0
        }
581
0
        if (x > x0)
582
0
        {
583
0
          malformed = 1;
584
0
        }
585
0
        while (x < x0)
586
0
        {
587
0
          uint8_t *c = &cells[x + w*y];
588
0
          if (*c == 0)
589
0
          {
590
0
            fz_write_printf(ctx, out, "<td></td>");
591
0
            *c = 1;
592
0
          }
593
0
          x++;
594
0
        }
595
0
        fz_write_string(ctx, out, "<td");
596
0
        if (x1 > x0+1)
597
0
          fz_write_printf(ctx, out, " rowspan=%d", x1-x0);
598
0
        if (y1 > y0+1)
599
0
          fz_write_printf(ctx, out, " colspan=%d", y1-y0);
600
0
        fz_write_string(ctx, out, ">\n");
601
0
        run_to_xhtml(ctx, td->u.s.down->first_block, out);
602
0
        fz_write_printf(ctx, out, "</td>\n");
603
0
        for ( ; y0 < y1; y0++)
604
0
          for (x = x0; x < x1; x++)
605
0
          {
606
0
            uint8_t *c = &cells[x + w*y0];
607
0
            if (*c != 0)
608
0
              malformed = 1;
609
0
            *c = 1;
610
0
          }
611
0
      }
612
0
      fz_write_printf(ctx, out, "</tr>\n");
613
0
      y++;
614
0
    }
615
616
0
    fz_write_printf(ctx, out, "</table>\n");
617
0
  }
618
0
  fz_always(ctx)
619
0
    fz_free(ctx, cells);
620
0
  fz_catch(ctx)
621
0
    fz_rethrow(ctx);
622
623
0
  if (malformed)
624
0
    fz_warn(ctx, "Malformed table data");
625
0
}
626
627
static void
628
fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
629
0
{
630
0
  int w = block->bbox.x1 - block->bbox.x0;
631
0
  int h = block->bbox.y1 - block->bbox.y0;
632
633
0
  fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
634
0
  fz_write_image_as_data_uri(ctx, out, block->u.i.image);
635
0
  fz_write_string(ctx, out, "\"/></p>\n");
636
0
}
637
638
static void
639
fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
640
0
{
641
0
  int is_mono = fz_font_is_monospaced(ctx, font);
642
0
  int is_bold = fz_font_is_bold(ctx, font);
643
0
  int is_italic = fz_font_is_italic(ctx, font);
644
645
0
  if (sup)
646
0
    fz_write_string(ctx, out, "<sup>");
647
0
  if (is_mono)
648
0
    fz_write_string(ctx, out, "<tt>");
649
0
  if (is_bold)
650
0
    fz_write_string(ctx, out, "<b>");
651
0
  if (is_italic)
652
0
    fz_write_string(ctx, out, "<i>");
653
0
}
654
655
static void
656
fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
657
0
{
658
0
  int is_mono = fz_font_is_monospaced(ctx, font);
659
0
  int is_bold = fz_font_is_bold(ctx, font);
660
0
  int is_italic = fz_font_is_italic(ctx, font);
661
662
0
  if (is_italic)
663
0
    fz_write_string(ctx, out, "</i>");
664
0
  if (is_bold)
665
0
    fz_write_string(ctx, out, "</b>");
666
0
  if (is_mono)
667
0
    fz_write_string(ctx, out, "</tt>");
668
0
  if (sup)
669
0
    fz_write_string(ctx, out, "</sup>");
670
0
}
671
672
/*
  Return the mean font size over the character list starting at 'ch',
  or 0 for an empty list.
*/
static float avg_font_size_of_line(fz_stext_char *ch)
{
  float total = 0;
  int count = 0;

  for (; ch; ch = ch->next)
  {
    total += ch->size;
    count++;
  }

  if (count == 0)
    return 0;
  return total / count;
}
686
687
/*
  Pick a block-level tag from a font size: "h1" for 20 and up, "h2" for
  15 and up, "h3" for 12 and up, and "p" otherwise.
*/
static const char *tag_from_font_size(float size)
{
  static const struct { float min_size; const char *tag; } levels[] = {
    { 20, "h1" },
    { 15, "h2" },
    { 12, "h3" },
  };
  unsigned int i;

  for (i = 0; i < sizeof levels / sizeof levels[0]; i++)
    if (size >= levels[i].min_size)
      return levels[i].tag;

  return "p";
}
694
695
/*
  Write a text block as flowing XHTML.

  Each line is assigned a tag (h1/h2/h3/p) from its average font size;
  consecutive lines with the same tag share one element. Inline style
  tags follow font and superscript changes and are closed and reopened
  around any change of the enclosing element so that nesting stays well
  formed. A single space is inserted between lines unless the previous
  line already ended in one. Markup-significant characters are written
  as entities and anything outside 32..127 as a numeric reference.

  A block with no lines produces no output.
*/
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
  fz_stext_line *line;
  fz_stext_char *ch;

  fz_font *font = NULL;
  int sup = 0;
  int sp = 1; /* non-zero if the last thing written was a space (or nothing) */
  const char *tag = NULL;
  const char *new_tag;

  for (line = block->u.t.first_line; line; line = line->next)
  {
    new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
    /* Pointer comparison is intentional: tag_from_font_size hands back
     * the same literal for the same tag every time. */
    if (tag != new_tag)
    {
      if (tag)
      {
        if (font)
          fz_print_style_end_xhtml(ctx, out, font, sup);
        fz_write_printf(ctx, out, "</%s>", tag);
      }
      tag = new_tag;
      fz_write_printf(ctx, out, "<%s>", tag);
      if (font)
        fz_print_style_begin_xhtml(ctx, out, font, sup);
    }

    if (!sp)
      fz_write_byte(ctx, out, ' ');

    for (ch = line->first_char; ch; ch = ch->next)
    {
      int ch_sup = detect_super_script(line, ch);
      if (ch->font != font || ch_sup != sup)
      {
        if (font)
          fz_print_style_end_xhtml(ctx, out, font, sup);
        font = ch->font;
        sup = ch_sup;
        fz_print_style_begin_xhtml(ctx, out, font, sup);
      }

      sp = (ch->c == ' ');
      switch (ch->c)
      {
      default:
        if (ch->c >= 32 && ch->c <= 127)
          fz_write_byte(ctx, out, ch->c);
        else
          fz_write_printf(ctx, out, "&#x%x;", ch->c);
        break;
      case '<': fz_write_string(ctx, out, "&lt;"); break;
      case '>': fz_write_string(ctx, out, "&gt;"); break;
      case '&': fz_write_string(ctx, out, "&amp;"); break;
      case '"': fz_write_string(ctx, out, "&quot;"); break;
      case '\'': fz_write_string(ctx, out, "&apos;"); break;
      }
    }
  }

  /* No lines means no element was ever opened; there is nothing to
   * close, and 'tag' must not be passed to %s while NULL. */
  if (tag == NULL)
    return;

  if (font)
    fz_print_style_end_xhtml(ctx, out, font, sup);
  fz_write_printf(ctx, out, "</%s>\n", tag);
}
760
761
static void
762
fz_print_struct_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
763
0
{
764
0
  const char *tag;
765
766
0
  if (block->u.s.down == NULL)
767
0
    return;
768
769
0
  if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
770
0
  {
771
0
    fz_print_stext_table_as_xhtml(ctx, out, block->u.s.down->first_block);
772
0
    return;
773
0
  }
774
775
0
  tag = html_tag_for_struct(block->u.s.down);
776
777
0
  fz_write_printf(ctx, out, "<%s>\n", tag);
778
779
0
  run_to_xhtml(ctx, block->u.s.down->first_block, out);
780
781
0
  fz_write_printf(ctx, out, "</%s>\n", tag);
782
0
}
783
784
static void
785
run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out)
786
0
{
787
0
  while (block)
788
0
  {
789
0
    switch(block->type)
790
0
    {
791
0
    case FZ_STEXT_BLOCK_IMAGE:
792
0
      fz_print_stext_image_as_xhtml(ctx, out, block);
793
0
      break;
794
0
    case FZ_STEXT_BLOCK_TEXT:
795
0
      fz_print_stext_block_as_xhtml(ctx, out, block);
796
0
      break;
797
0
    case FZ_STEXT_BLOCK_STRUCT:
798
0
      fz_print_struct_as_xhtml(ctx, out, block);
799
0
      break;
800
0
    }
801
0
    block = block->next;
802
0
  }
803
0
}
804
805
void
806
fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
807
0
{
808
0
  fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
809
810
0
  run_to_xhtml(ctx, page->first_block, out);
811
812
0
  fz_write_string(ctx, out, "</div>\n");
813
0
}
814
815
void
816
fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
817
0
{
818
0
  fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
819
0
  fz_write_string(ctx, out, "<!DOCTYPE html");
820
0
  fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
821
0
  fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
822
0
  fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
823
0
  fz_write_string(ctx, out, "<head>\n");
824
0
  fz_write_string(ctx, out, "<style>\n");
825
0
  fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
826
0
  fz_write_string(ctx, out, "</style>\n");
827
0
  fz_write_string(ctx, out, "</head>\n");
828
0
  fz_write_string(ctx, out, "<body>\n");
829
0
}
830
831
void
832
fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
833
0
{
834
0
  fz_write_string(ctx, out, "</body>\n");
835
0
  fz_write_string(ctx, out, "</html>\n");
836
0
}
837
838
/* Detailed XML dump of the entire structured text data */
839
840
static void
841
xml_write_char(fz_context *ctx, fz_output *out, int c)
842
0
{
843
0
  switch (c)
844
0
  {
845
0
  case '<': fz_write_string(ctx, out, "&lt;"); break;
846
0
  case '>': fz_write_string(ctx, out, "&gt;"); break;
847
0
  case '&': fz_write_string(ctx, out, "&amp;"); break;
848
0
  case '"': fz_write_string(ctx, out, "&quot;"); break;
849
0
  case '\'': fz_write_string(ctx, out, "&apos;"); break;
850
0
  default:
851
0
    if (c >= 32 && c <= 127)
852
0
      fz_write_printf(ctx, out, "%c", c);
853
0
    else
854
0
      fz_write_printf(ctx, out, "&#x%x;", c);
855
0
    break;
856
0
  }
857
0
}
858
859
static void
860
as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out)
861
0
{
862
0
  fz_stext_line *line;
863
0
  fz_stext_char *ch;
864
0
  int i;
865
866
0
  while (block)
867
0
  {
868
0
    switch (block->type)
869
0
    {
870
0
    case FZ_STEXT_BLOCK_TEXT:
871
0
      fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\"",
872
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
873
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
874
0
        fz_write_printf(ctx, out, " justify=\"unknown\"");
875
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_LEFT)
876
0
        fz_write_printf(ctx, out, " justify=\"left\"");
877
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_CENTRE)
878
0
        fz_write_printf(ctx, out, " justify=\"centre\"");
879
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_RIGHT)
880
0
        fz_write_printf(ctx, out, " justify=\"right\"");
881
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
882
0
        fz_write_printf(ctx, out, " justify=\"full\"");
883
0
      fz_write_printf(ctx, out, ">\n");
884
0
      for (line = block->u.t.first_line; line; line = line->next)
885
0
      {
886
0
        fz_font *font = NULL;
887
0
        float size = 0;
888
0
        const char *name = NULL;
889
890
0
        fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\" flags=\"%d\"",
891
0
            line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
892
0
            line->wmode,
893
0
            line->dir.x, line->dir.y, line->flags);
894
895
        /* This is duplication of information, but it makes it MUCH easier to search for
896
         * text fragments in large output. */
897
0
        {
898
0
          int valid = 1;
899
0
          fz_write_printf(ctx, out, " text=\"");
900
0
          for (ch = line->first_char; ch; ch = ch->next)
901
0
          {
902
0
            if (valid)
903
0
              valid = fz_is_valid_xml_char(ch->c);
904
0
            xml_write_char(ctx, out, fz_range_limit_xml_char(ch->c));
905
0
          }
906
0
          if (!valid)
907
0
          {
908
0
            fz_write_printf(ctx, out, "\" hextext=\"");
909
0
            for (ch = line->first_char; ch; ch = ch->next)
910
0
            {
911
0
              char text[8];
912
0
              int n = fz_runetochar(text, ch->c);
913
0
              for (i = 0; i < n; i++)
914
0
                fz_write_printf(ctx, out, "%02x", text[i]);
915
0
            }
916
0
          }
917
0
          fz_write_printf(ctx, out, "\"");
918
0
        }
919
920
0
        fz_write_printf(ctx, out, ">\n");
921
922
0
        for (ch = line->first_char; ch; ch = ch->next)
923
0
        {
924
0
          if (ch->font != font || ch->size != size)
925
0
          {
926
0
            const char *s;
927
0
            if (font)
928
0
              fz_write_string(ctx, out, "</font>\n");
929
0
            font = ch->font;
930
0
            size = ch->size;
931
0
            s = name = font_full_name(ctx, font);
932
0
            while (*s)
933
0
            {
934
0
              int c = *s++;
935
0
              if (c < 32 || c >= 127)
936
0
                break;
937
0
            }
938
0
            if (*s)
939
0
              fz_write_printf(ctx, out, "<font hexname=%>", name);
940
0
            else
941
0
              fz_write_printf(ctx, out, "<font name=\"%s\"", name);
942
0
            fz_write_printf(ctx, out, " size=\"%g\">\n", size);
943
0
          }
944
0
          fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" alpha=\"#%02x\" flags=\"%d\" c=\"",
945
0
              ch->quad.ul.x, ch->quad.ul.y,
946
0
              ch->quad.ur.x, ch->quad.ur.y,
947
0
              ch->quad.ll.x, ch->quad.ll.y,
948
0
              ch->quad.lr.x, ch->quad.lr.y,
949
0
              ch->origin.x, ch->origin.y,
950
0
              ch->bidi,
951
0
              ch->argb & 0xFFFFFF,
952
0
              ch->argb>>24,
953
0
              ch->flags);
954
0
          xml_write_char(ctx, out, ch->c);
955
0
          if (!fz_is_valid_xml_char(ch->c))
956
0
          {
957
0
            char text[8];
958
0
            int n = fz_runetochar(text, ch->c);
959
0
            fz_write_string(ctx, out, "\" hexc=\"");
960
0
            for (i = 0; i < n; i++)
961
0
              fz_write_printf(ctx, out, "%02x", text[i]);
962
0
          }
963
0
          fz_write_string(ctx, out, "\"/>\n");
964
0
        }
965
966
0
        if (font)
967
0
          fz_write_string(ctx, out, "</font>\n");
968
969
0
        fz_write_string(ctx, out, "</line>\n");
970
0
      }
971
0
      fz_write_string(ctx, out, "</block>\n");
972
0
      break;
973
974
0
    case FZ_STEXT_BLOCK_IMAGE:
975
0
      fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
976
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
977
0
      break;
978
979
0
    case FZ_STEXT_BLOCK_STRUCT:
980
0
      fz_write_printf(ctx, out, "<struct idx=\"%d\" bbox=\"%g %g %g %g\"", block->u.s.index,
981
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
982
0
      if (block->u.s.down)
983
0
        fz_write_printf(ctx, out, " raw=\"%s\" std=\"%s\"",
984
0
            block->u.s.down->raw, fz_structure_to_string(block->u.s.down->standard));
985
0
      fz_write_printf(ctx, out, ">\n");
986
0
      if (block->u.s.down)
987
0
        as_xml(ctx, block->u.s.down->first_block, out);
988
0
      fz_write_printf(ctx, out, "</struct>\n");
989
0
      break;
990
991
0
    case FZ_STEXT_BLOCK_VECTOR:
992
0
      fz_write_printf(ctx, out, "<vector bbox=\"%g %g %g %g\" stroke=\"%d\" rectangle=\"%d\" continues=\"%d\" argb=\"%08x\"/>\n",
993
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1,
994
0
          !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED),
995
0
          !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE),
996
0
          !!(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES),
997
0
          block->u.v.argb);
998
0
      break;
999
1000
0
    case FZ_STEXT_BLOCK_GRID:
1001
0
      fz_write_printf(ctx, out, "<grid xpos=\"");
1002
0
      for (i = 0; i < block->u.b.xs->len; i++)
1003
0
        fz_write_printf(ctx, out, "%g ", block->u.b.xs->list[i].pos);
1004
0
      fz_write_printf(ctx, out, "\" xuncertainty=\"");
1005
0
      for (i = 0; i < block->u.b.xs->len; i++)
1006
0
        fz_write_printf(ctx, out, "%d ", block->u.b.xs->list[i].uncertainty);
1007
0
      fz_write_printf(ctx, out, "\" xmaxuncertainty=\"%d\" ypos=\"", block->u.b.xs->max_uncertainty);
1008
0
      for (i = 0; i < block->u.b.ys->len; i++)
1009
0
        fz_write_printf(ctx, out, "%g ", block->u.b.ys->list[i].pos);
1010
0
      fz_write_printf(ctx, out, "\" yuncertainty=\"");
1011
0
      for (i = 0; i < block->u.b.ys->len; i++)
1012
0
        fz_write_printf(ctx, out, "%d ", block->u.b.ys->list[i].uncertainty);
1013
0
      fz_write_printf(ctx, out, "\" ymaxuncertainty=\"%d\" />\n", block->u.b.ys->max_uncertainty);
1014
0
      break;
1015
0
    }
1016
0
    block = block->next;
1017
0
  }
1018
0
}
1019
1020
void
1021
fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
1022
0
{
1023
0
  fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
1024
0
    page->mediabox.x1 - page->mediabox.x0,
1025
0
    page->mediabox.y1 - page->mediabox.y0);
1026
1027
0
  as_xml(ctx, page->first_block, out);
1028
1029
0
  fz_write_string(ctx, out, "</page>\n");
1030
0
}
1031
1032
/* JSON dump */
1033
1034
static void
1035
as_json(fz_context *ctx, fz_stext_block *block, fz_output *out, float scale)
1036
0
{
1037
0
  fz_stext_line *line;
1038
0
  fz_stext_char *ch;
1039
0
  int comma = 0;
1040
1041
0
  while (block)
1042
0
  {
1043
0
    if (comma)
1044
0
      fz_write_string(ctx, out, ",");
1045
0
    comma = 1;
1046
1047
0
    switch (block->type)
1048
0
    {
1049
0
    case FZ_STEXT_BLOCK_TEXT:
1050
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
1051
0
      fz_write_printf(ctx, out, "%q:{", "bbox");
1052
0
      fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
1053
0
      fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
1054
0
      fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
1055
0
      fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
1056
0
      fz_write_printf(ctx, out, "%q:[", "lines");
1057
1058
0
      for (line = block->u.t.first_line; line; line = line->next)
1059
0
      {
1060
0
        if (line != block->u.t.first_line)
1061
0
          fz_write_string(ctx, out, ",");
1062
0
        fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
1063
0
        fz_write_printf(ctx, out, "%q:{", "bbox");
1064
0
        fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
1065
0
        fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
1066
0
        fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
1067
0
        fz_write_printf(ctx, out, "%q:%d,", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
1068
0
        fz_write_printf(ctx, out, "%q:%d},", "flags", line->flags);
1069
1070
        /* Since we force preserve-spans, the first char has the style for the entire line. */
1071
0
        if (line->first_char)
1072
0
        {
1073
0
          fz_font *font = line->first_char->font;
1074
0
          char *font_family = "sans-serif";
1075
0
          char *font_weight = "normal";
1076
0
          char *font_style = "normal";
1077
0
          if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
1078
0
          else if (fz_font_is_serif(ctx, font)) font_family = "serif";
1079
0
          if (fz_font_is_bold(ctx, font)) font_weight = "bold";
1080
0
          if (fz_font_is_italic(ctx, font)) font_style = "italic";
1081
0
          fz_write_printf(ctx, out, "%q:{", "font");
1082
0
          fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
1083
0
          fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
1084
0
          fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
1085
0
          fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
1086
0
          fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
1087
0
          fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
1088
0
          fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
1089
0
        }
1090
1091
0
        fz_write_printf(ctx, out, "%q:\"", "text");
1092
0
        for (ch = line->first_char; ch; ch = ch->next)
1093
0
        {
1094
0
          if (ch->c == '"' || ch->c == '\\')
1095
0
            fz_write_printf(ctx, out, "\\%c", ch->c);
1096
0
          else if (ch->c < 32)
1097
0
            fz_write_printf(ctx, out, "\\u%04x", ch->c);
1098
0
          else
1099
0
            fz_write_printf(ctx, out, "%C", ch->c);
1100
0
        }
1101
0
        fz_write_printf(ctx, out, "\"}");
1102
0
      }
1103
0
      fz_write_string(ctx, out, "]}");
1104
0
      break;
1105
1106
0
    case FZ_STEXT_BLOCK_IMAGE:
1107
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
1108
0
      fz_write_printf(ctx, out, "%q:{", "bbox");
1109
0
      fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
1110
0
      fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
1111
0
      fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
1112
0
      fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
1113
0
      break;
1114
1115
0
    case FZ_STEXT_BLOCK_STRUCT:
1116
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "structure");
1117
0
      fz_write_printf(ctx, out, "%q:%d", "index", block->u.s.index);
1118
0
      if (block->u.s.down)
1119
0
      {
1120
0
        fz_write_printf(ctx, out, ",%q:%q", "raw", block->u.s.down->raw);
1121
0
        fz_write_printf(ctx, out, ",%q:%q", "std", fz_structure_to_string(block->u.s.down->standard));
1122
0
        fz_write_printf(ctx, out, ",%q:[", "contents");
1123
0
        as_json(ctx, block->u.s.down->first_block, out, scale);
1124
0
        fz_write_printf(ctx, out, "]");
1125
0
      }
1126
0
      fz_write_printf(ctx, out, "}");
1127
0
      break;
1128
1129
0
    }
1130
0
    block = block->next;
1131
0
  }
1132
0
}
1133
1134
void
1135
fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
1136
0
{
1137
0
  fz_write_printf(ctx, out, "{%q:[", "blocks");
1138
1139
0
  as_json(ctx, page->first_block, out, scale);
1140
1141
0
  fz_write_string(ctx, out, "]}");
1142
0
}
1143
1144
/* Plain text */
1145
1146
static void
1147
do_as_text(fz_context *ctx, fz_output *out, fz_stext_block *first_block)
1148
0
{
1149
0
  fz_stext_block *block;
1150
0
  fz_stext_line *line;
1151
0
  fz_stext_char *ch;
1152
0
  char utf[10];
1153
0
  int i, n;
1154
1155
0
  for (block = first_block; block; block = block->next)
1156
0
  {
1157
0
    switch (block->type)
1158
0
    {
1159
0
    case FZ_STEXT_BLOCK_TEXT:
1160
0
      for (line = block->u.t.first_line; line; line = line->next)
1161
0
      {
1162
0
        int break_line = 1;
1163
0
        for (ch = line->first_char; ch; ch = ch->next)
1164
0
        {
1165
0
          if (ch->next == NULL && (line->flags & FZ_STEXT_LINE_FLAGS_JOINED) != 0)
1166
0
          {
1167
0
            break_line = 0;
1168
0
            continue;
1169
0
          }
1170
0
          n = fz_runetochar(utf, ch->c);
1171
0
          for (i = 0; i < n; i++)
1172
0
            fz_write_byte(ctx, out, utf[i]);
1173
0
        }
1174
0
        if (break_line)
1175
0
          fz_write_string(ctx, out, "\n");
1176
0
      }
1177
0
      fz_write_string(ctx, out, "\n");
1178
0
      break;
1179
0
    case FZ_STEXT_BLOCK_STRUCT:
1180
0
      if (block->u.s.down != NULL)
1181
0
        do_as_text(ctx, out, block->u.s.down->first_block);
1182
0
      break;
1183
0
    }
1184
0
  }
1185
0
}
1186
1187
void
1188
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
1189
0
{
1190
0
  do_as_text(ctx, out, page->first_block);
1191
0
}
1192
1193
/* Text output writer */
1194
1195
/* Output formats understood by the text document writer. */
enum {
  FZ_FORMAT_TEXT = 0,   /* plain text; also used when the format name is not recognised */
  FZ_FORMAT_HTML,       /* selected by "html" */
  FZ_FORMAT_XHTML,      /* selected by "xhtml" */
  FZ_FORMAT_STEXT_XML,  /* selected by "stext" or "stext.xml" */
  FZ_FORMAT_STEXT_JSON, /* selected by "stext.json" */
};
1202
1203
typedef struct
1204
{
1205
  fz_document_writer super;
1206
  int format;
1207
  int number;
1208
  fz_stext_options opts;
1209
  fz_stext_page *page;
1210
  fz_output *out;
1211
} fz_text_writer;
1212
1213
static fz_device *
1214
text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
1215
0
{
1216
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1217
0
  float s = wri->opts.scale;
1218
1219
0
  if (wri->page)
1220
0
  {
1221
0
    fz_drop_stext_page(ctx, wri->page);
1222
0
    wri->page = NULL;
1223
0
  }
1224
1225
0
  wri->number++;
1226
1227
0
  wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s)));
1228
0
  return fz_new_stext_device(ctx, wri->page, &wri->opts);
1229
0
}
1230
1231
static void
1232
text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
1233
0
{
1234
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1235
0
  float s = wri->opts.scale;
1236
1237
0
  fz_scale_stext_page(ctx, wri->page, s);
1238
1239
0
  fz_try(ctx)
1240
0
  {
1241
0
    fz_close_device(ctx, dev);
1242
0
    switch (wri->format)
1243
0
    {
1244
0
    default:
1245
0
    case FZ_FORMAT_TEXT:
1246
0
      fz_print_stext_page_as_text(ctx, wri->out, wri->page);
1247
0
      break;
1248
0
    case FZ_FORMAT_HTML:
1249
0
      fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
1250
0
      break;
1251
0
    case FZ_FORMAT_XHTML:
1252
0
      fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
1253
0
      break;
1254
0
    case FZ_FORMAT_STEXT_XML:
1255
0
      fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
1256
0
      break;
1257
0
    case FZ_FORMAT_STEXT_JSON:
1258
0
      if (wri->number > 1)
1259
0
        fz_write_string(ctx, wri->out, ",");
1260
0
      fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
1261
0
      break;
1262
0
    }
1263
0
  }
1264
0
  fz_always(ctx)
1265
0
  {
1266
0
    fz_drop_device(ctx, dev);
1267
0
    fz_drop_stext_page(ctx, wri->page);
1268
0
    wri->page = NULL;
1269
0
  }
1270
0
  fz_catch(ctx)
1271
0
    fz_rethrow(ctx);
1272
0
}
1273
1274
static void
1275
text_close_writer(fz_context *ctx, fz_document_writer *wri_)
1276
0
{
1277
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1278
0
  switch (wri->format)
1279
0
  {
1280
0
  case FZ_FORMAT_HTML:
1281
0
    fz_print_stext_trailer_as_html(ctx, wri->out);
1282
0
    break;
1283
0
  case FZ_FORMAT_XHTML:
1284
0
    fz_print_stext_trailer_as_xhtml(ctx, wri->out);
1285
0
    break;
1286
0
  case FZ_FORMAT_STEXT_XML:
1287
0
    fz_write_string(ctx, wri->out, "</document>\n");
1288
0
    break;
1289
0
  case FZ_FORMAT_STEXT_JSON:
1290
0
    fz_write_string(ctx, wri->out, "]\n");
1291
0
    break;
1292
0
  }
1293
0
  fz_close_output(ctx, wri->out);
1294
0
}
1295
1296
static void
1297
text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
1298
0
{
1299
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1300
0
  fz_drop_stext_page(ctx, wri->page);
1301
0
  fz_drop_output(ctx, wri->out);
1302
0
}
1303
1304
fz_document_writer *
1305
fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
1306
0
{
1307
0
  fz_text_writer *wri = NULL;
1308
1309
0
  fz_var(wri);
1310
1311
0
  fz_try(ctx)
1312
0
  {
1313
0
    wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
1314
0
    fz_parse_stext_options(ctx, &wri->opts, options);
1315
1316
0
    wri->format = FZ_FORMAT_TEXT;
1317
0
    if (!strcmp(format, "text"))
1318
0
      wri->format = FZ_FORMAT_TEXT;
1319
0
    else if (!strcmp(format, "html"))
1320
0
      wri->format = FZ_FORMAT_HTML;
1321
0
    else if (!strcmp(format, "xhtml"))
1322
0
      wri->format = FZ_FORMAT_XHTML;
1323
0
    else if (!strcmp(format, "stext"))
1324
0
      wri->format = FZ_FORMAT_STEXT_XML;
1325
0
    else if (!strcmp(format, "stext.xml"))
1326
0
      wri->format = FZ_FORMAT_STEXT_XML;
1327
0
    else if (!strcmp(format, "stext.json"))
1328
0
    {
1329
0
      wri->format = FZ_FORMAT_STEXT_JSON;
1330
0
      wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
1331
0
    }
1332
1333
0
    wri->out = out;
1334
1335
0
    switch (wri->format)
1336
0
    {
1337
0
    case FZ_FORMAT_HTML:
1338
0
      fz_print_stext_header_as_html(ctx, wri->out);
1339
0
      break;
1340
0
    case FZ_FORMAT_XHTML:
1341
0
      fz_print_stext_header_as_xhtml(ctx, wri->out);
1342
0
      break;
1343
0
    case FZ_FORMAT_STEXT_XML:
1344
0
      fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
1345
0
      fz_write_string(ctx, wri->out, "<document>\n");
1346
0
      break;
1347
0
    case FZ_FORMAT_STEXT_JSON:
1348
0
      fz_write_string(ctx, wri->out, "[");
1349
0
      break;
1350
0
    }
1351
0
  }
1352
0
  fz_catch(ctx)
1353
0
  {
1354
0
    fz_drop_output(ctx, out);
1355
0
    fz_free(ctx, wri);
1356
0
    fz_rethrow(ctx);
1357
0
  }
1358
1359
0
  return (fz_document_writer*)wri;
1360
0
}
1361
1362
fz_document_writer *
1363
fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
1364
0
{
1365
0
  fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
1366
0
  return fz_new_text_writer_with_output(ctx, format, out, options);
1367
0
}