Coverage Report

Created: 2025-12-31 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/fitz/stext-output.c
Line
Count
Source
1
// Copyright (C) 2004-2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#define SUBSCRIPT_OFFSET 0.2f
26
#define SUPERSCRIPT_OFFSET -0.2f
27
28
#include <ft2build.h>
29
#include FT_FREETYPE_H
30
31
// Text black color when converted from DeviceCMYK to RGB
32
0
#define CMYK_BLACK 0x221f1f
33
34
static void
35
scale_run(fz_context *ctx, fz_stext_block *block, float scale)
36
0
{
37
0
  fz_matrix m = fz_scale(scale, scale);
38
0
  fz_stext_line *line;
39
0
  fz_stext_char *ch;
40
41
0
  while (block)
42
0
  {
43
0
    block->bbox = fz_transform_rect(block->bbox, m);
44
0
    switch (block->type)
45
0
    {
46
0
    case FZ_STEXT_BLOCK_TEXT:
47
0
      for (line = block->u.t.first_line; line; line = line->next)
48
0
      {
49
0
        line->bbox = fz_transform_rect(block->bbox, m);
50
0
        for (ch = line->first_char; ch; ch = ch->next)
51
0
        {
52
0
          ch->origin = fz_transform_point(ch->origin, m);
53
0
          ch->quad = fz_transform_quad(ch->quad, m);
54
0
          ch->size = ch->size * scale;
55
0
        }
56
0
      }
57
0
      break;
58
59
0
    case FZ_STEXT_BLOCK_IMAGE:
60
0
      block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale);
61
0
      break;
62
63
0
    case FZ_STEXT_BLOCK_STRUCT:
64
0
      if (block->u.s.down)
65
0
        scale_run(ctx, block->u.s.down->first_block, scale);
66
0
      break;
67
0
    }
68
0
    block = block->next;
69
0
  }
70
0
}
71
72
/* Scale all of the blocks on a structured-text page by 'scale'. */
static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale)
{
  fz_stext_block *head = page->first_block;

  scale_run(ctx, head, scale);
}
76
77
/* HTML output (visual formatting with preserved layout) */
78
79
static int
80
detect_super_script(fz_stext_line *line, fz_stext_char *ch)
81
0
{
82
0
  if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
83
0
    return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
84
0
  return 0;
85
0
}
86
87
static const char *
88
font_full_name(fz_context *ctx, fz_font *font)
89
0
{
90
0
  const char *name = fz_font_name(ctx, font);
91
0
  const char *s = strchr(name, '+');
92
0
  return s ? s + 1 : name;
93
0
}
94
95
/*
 * Map a font name onto a common HTML font family by substring match.
 *
 * Checked in order: "Times" -> "Times New Roman"; "Arial"/"Helvetica" ->
 * "Arial Narrow" if the name also contains "Narrow" or "Condensed",
 * otherwise "Arial"; "Courier" -> "Courier". Names matching none of these
 * are returned unchanged (the same pointer that was passed in).
 */
static const char *
html_clean_font_name(const char *fontname)
{
  int arial_like;

  if (strstr(fontname, "Times") != NULL)
    return "Times New Roman";

  arial_like = strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL;
  if (arial_like)
  {
    int narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
    return narrow ? "Arial Narrow" : "Arial";
  }

  if (strstr(fontname, "Courier") != NULL)
    return "Courier";

  return fontname;
}
110
111
static void
112
font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
113
0
{
114
0
  const char *name = html_clean_font_name(font_full_name(ctx, font));
115
0
  char *s;
116
0
  fz_strlcpy(buf, name, size);
117
0
  s = strrchr(buf, '-');
118
0
  if (s)
119
0
    *s = 0;
120
0
  if (is_mono)
121
0
    fz_strlcat(buf, ",monospace", size);
122
0
  else
123
0
    fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
124
0
}
125
126
static void
127
fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
128
0
{
129
0
  char family[80];
130
131
0
  int is_bold = fz_font_is_bold(ctx, font);
132
0
  int is_italic = fz_font_is_italic(ctx, font);
133
0
  int is_serif = fz_font_is_serif(ctx, font);
134
0
  int is_mono = fz_font_is_monospaced(ctx, font);
135
136
0
  font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
137
138
0
  if (sup) fz_write_string(ctx, out, "<sup>");
139
0
  if (is_mono) fz_write_string(ctx, out, "<tt>");
140
0
  if (is_bold) fz_write_string(ctx, out, "<b>");
141
0
  if (is_italic) fz_write_string(ctx, out, "<i>");
142
0
  fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size);
143
0
  if (color != 0 && color != CMYK_BLACK)
144
0
    fz_write_printf(ctx, out, ";color:#%06x", color & 0xffffff);
145
0
  fz_write_printf(ctx, out, "\">");
146
0
}
147
148
static void
149
fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
150
0
{
151
0
  int is_mono = fz_font_is_monospaced(ctx, font);
152
0
  int is_bold = fz_font_is_bold(ctx,font);
153
0
  int is_italic = fz_font_is_italic(ctx, font);
154
155
0
  fz_write_string(ctx, out, "</span>");
156
0
  if (is_italic) fz_write_string(ctx, out, "</i>");
157
0
  if (is_bold) fz_write_string(ctx, out, "</b>");
158
0
  if (is_mono) fz_write_string(ctx, out, "</tt>");
159
0
  if (sup) fz_write_string(ctx, out, "</sup>");
160
0
}
161
162
static void
163
fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
164
0
{
165
0
  fz_matrix ctm = block->u.i.transform;
166
167
0
#define USE_CSS_MATRIX_TRANSFORMS
168
0
#ifdef USE_CSS_MATRIX_TRANSFORMS
169
  /* Matrix maths notes.
170
   * When we get here ctm maps the unit square to the position in device
171
   * space occupied by the image.
172
   *
173
   * That is to say that mapping the 4 corners of the unit square through
174
   * the transform, give us the 4 target corners. We extend the corners
175
   * by adding an extra '1' into them to allow transforms to work. Thus
176
   * (x,y) maps through ctm = (a b c d e f) as:
177
   *
178
   * (x y 1) (a b 0) = (X Y 1)
179
   *         (c d 0)
180
   *         (e f 1)
181
   *
182
   * To simplify reading of matrix maths, we use the trick where we
183
   * 'drop' the first matrix down the page. Thus the corners c0=(0,0),
184
   * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively:
185
   *
186
   *         (    a     b 0)
187
   *         (    c     d 0)
188
   *         (    e     f 1)
189
   * (0 0 1) (    e     f 1)
190
   * (0 1 1) (  c+e   d+f 1)
191
   * (1 1 1) (a+c+e b+d+f 1)
192
   * (1 0 1) (  a+e   b+f 1)
193
   *
194
   * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f)
195
   *
196
   * Unfortunately, the CSS matrix transform, does not map the unit square.
197
   * Rather it does something moderately mad. As far as I can work out, the
198
   * top left corner of a (0,0) -> (w, h) box is transformed using the .e
199
   * and .f entries of the matrix. Then the image from within that square
200
   * is transformed using the centre of that square as the origin.
201
   *
202
   * So, an image placed at (0,0) in destination space with 1:1 transform
203
   * will result in an image a (0,0) as you'd expect. But an image at (0,0)
204
   * with a scale of 2, will result in 25% of the image off the left of the
205
   * screen, and 25% off the top.
206
   *
207
   * Accordingly, we have to adjust the ctm in several steps.
208
   */
209
  /* Move to moving the centre of the image. */
210
0
  ctm.e += (ctm.a+ctm.c)/2;
211
0
  ctm.f += (ctm.b+ctm.d)/2;
212
  /* Move from transforming the unit square to w/h */
213
0
  ctm.a /= block->u.i.image->w;
214
0
  ctm.b /= block->u.i.image->w;
215
0
  ctm.c /= block->u.i.image->h;
216
0
  ctm.d /= block->u.i.image->h;
217
  /* Move from points to pixels */
218
0
  ctm.a *= 96.0f/72;
219
0
  ctm.b *= 96.0f/72;
220
0
  ctm.c *= 96.0f/72;
221
0
  ctm.d *= 96.0f/72;
222
0
  ctm.e *= 96.0f/72;
223
0
  ctm.f *= 96.0f/72;
224
  /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */
225
0
  ctm.e -= block->u.i.image->w/2;
226
0
  ctm.f -= block->u.i.image->h/2;
227
228
0
  fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"",
229
0
    ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f);
230
#else
231
  /* Alternative version of the code that uses scaleX/Y and rotate
232
   * instead, but only copes with axis aligned cases. */
233
  int t;
234
235
  int x = block->bbox.x0;
236
  int y = block->bbox.y0;
237
  int w = block->bbox.x1 - block->bbox.x0;
238
  int h = block->bbox.y1 - block->bbox.y0;
239
240
  const char *flip = "";
241
242
  if (ctm.b == 0 && ctm.c == 0)
243
  {
244
    if (ctm.a < 0 && ctm.d < 0)
245
      flip = "transform: scaleX(-1) scaleY(-1);";
246
    else if (ctm.a < 0)
247
    {
248
      flip = "transform: scaleX(-1);";
249
    }
250
    else if (ctm.d < 0)
251
    {
252
      flip = "transform: scaleY(-1);";
253
    }
254
  } else if (ctm.a == 0 && ctm.d == 0) {
255
    if (ctm.b < 0 && ctm.c < 0)
256
    {
257
      flip = "transform: scaleY(-1) rotate(90deg);";
258
      x += (w-h)/2;
259
      y -= (w-h)/2;
260
      t = w; w = h; h = t;
261
    }
262
    else if (ctm.b < 0)
263
    {
264
      flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);";
265
      x += (w-h)/2;
266
      y -= (w-h)/2;
267
      t = w; w = h; h = t;
268
    }
269
    else if (ctm.c < 0)
270
    {
271
      flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);";
272
      x += (w-h)/2;
273
      y -= (w-h)/2;
274
      t = w; w = h; h = t;
275
    }
276
    else
277
    {
278
      flip = "transform: scaleY(-1) rotate(270deg);";
279
      x += (w-h)/2;
280
      y -= (w-h)/2;
281
      t = w; w = h; h = t;
282
    }
283
  }
284
285
  fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h);
286
#endif
287
0
  fz_write_image_as_data_uri(ctx, out, block->u.i.image);
288
0
  fz_write_string(ctx, out, "\">\n");
289
0
}
290
291
void
292
fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
293
0
{
294
0
  fz_stext_line *line;
295
0
  fz_stext_char *ch;
296
0
  float x, y, h;
297
298
0
  fz_font *font = NULL;
299
0
  float size = 0;
300
0
  int sup = 0;
301
0
  uint32_t color = 0;
302
303
0
  for (line = block->u.t.first_line; line; line = line->next)
304
0
  {
305
0
    x = line->bbox.x0;
306
0
    y = line->bbox.y0;
307
0
    h = line->bbox.y1 - line->bbox.y0;
308
309
0
    if (line->first_char)
310
0
    {
311
0
      h = line->first_char->size;
312
0
      y = line->first_char->origin.y - h * 0.8f;
313
0
    }
314
315
0
    fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h);
316
0
    font = NULL;
317
318
0
    for (ch = line->first_char; ch; ch = ch->next)
319
0
    {
320
0
      int ch_sup = detect_super_script(line, ch);
321
0
      if (ch->font != font || ch->size != size || ch_sup != sup || ch->argb != color)
322
0
      {
323
0
        if (font)
324
0
          fz_print_style_end_html(ctx, out, font, size, sup, color);
325
0
        font = ch->font;
326
0
        size = ch->size;
327
0
        color = ch->argb;
328
0
        sup = ch_sup;
329
0
        fz_print_style_begin_html(ctx, out, font, size, sup, color);
330
0
      }
331
332
0
      switch (ch->c)
333
0
      {
334
0
      default:
335
0
        if (ch->c >= 32 && ch->c <= 127)
336
0
          fz_write_byte(ctx, out, ch->c);
337
0
        else
338
0
          fz_write_printf(ctx, out, "&#x%x;", ch->c);
339
0
        break;
340
0
      case '<': fz_write_string(ctx, out, "&lt;"); break;
341
0
      case '>': fz_write_string(ctx, out, "&gt;"); break;
342
0
      case '&': fz_write_string(ctx, out, "&amp;"); break;
343
0
      case '"': fz_write_string(ctx, out, "&quot;"); break;
344
0
      case '\'': fz_write_string(ctx, out, "&apos;"); break;
345
0
      }
346
0
    }
347
348
0
    if (font)
349
0
      fz_print_style_end_html(ctx, out, font, size, sup, color);
350
351
0
    fz_write_string(ctx, out, "</p>\n");
352
0
  }
353
0
}
354
355
static const char *
356
html_tag_for_struct(fz_stext_struct *s)
357
0
{
358
0
  const char *raw;
359
360
0
  if (s == NULL)
361
0
    return "DIV";
362
363
0
  raw = s->raw;
364
0
  if (raw == NULL)
365
0
    raw = fz_structure_to_string(s->standard);
366
367
0
  if (!fz_strcasecmp(raw, "blockquote"))
368
0
    return "blockquote";
369
0
  if (!fz_strcasecmp(raw, "title"))
370
0
    return "h1";
371
0
  if (!fz_strcasecmp(raw, "sub"))
372
0
    return "sub";
373
0
  if (!fz_strcasecmp(raw, "p"))
374
0
    return "p";
375
0
  if (!fz_strcasecmp(raw, "h"))
376
0
    return "h1"; /* Pick one! */
377
0
  if (!fz_strcasecmp(raw, "h1"))
378
0
    return "h1";
379
0
  if (!fz_strcasecmp(raw, "h2"))
380
0
    return "h2";
381
0
  if (!fz_strcasecmp(raw, "h3"))
382
0
    return "h3";
383
0
  if (!fz_strcasecmp(raw, "h4"))
384
0
    return "h4";
385
0
  if (!fz_strcasecmp(raw, "h5"))
386
0
    return "h5";
387
0
  if (!fz_strcasecmp(raw, "h6"))
388
0
    return "h6";
389
390
0
  if (!fz_strcasecmp(raw, "list"))
391
0
    return "ul";
392
0
  if (!fz_strcasecmp(raw, "listitem"))
393
0
    return "li";
394
0
  if (!fz_strcasecmp(raw, "table"))
395
0
    return "table";
396
0
  if (!fz_strcasecmp(raw, "tr"))
397
0
    return "tr";
398
0
  if (!fz_strcasecmp(raw, "th"))
399
0
    return "th";
400
0
  if (!fz_strcasecmp(raw, "td"))
401
0
    return "td";
402
0
  if (!fz_strcasecmp(raw, "thead"))
403
0
    return "thead";
404
0
  if (!fz_strcasecmp(raw, "tbody"))
405
0
    return "tbody";
406
0
  if (!fz_strcasecmp(raw, "tfoot"))
407
0
    return "tfoot";
408
409
0
  if (!fz_strcasecmp(raw, "span"))
410
0
    return "span";
411
0
  if (!fz_strcasecmp(raw, "code"))
412
0
    return "code";
413
0
  if (!fz_strcasecmp(raw, "em"))
414
0
    return "em";
415
0
  if (!fz_strcasecmp(raw, "strong"))
416
0
    return "strong";
417
418
0
  return "div";
419
0
}
420
421
static void
422
print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block);
423
424
static void
425
fz_print_stext_struct_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
426
0
{
427
0
  const char *tag;
428
429
0
  if (block->u.s.down == NULL)
430
0
    return;
431
432
0
  tag = html_tag_for_struct(block->u.s.down);
433
434
0
  fz_write_printf(ctx, out, "<%s>\n", tag);
435
436
0
  print_blocks_as_html(ctx, out, block->u.s.down->first_block);
437
438
0
  fz_write_printf(ctx, out, "</%s>\n", tag);
439
0
}
440
441
static void
442
print_blocks_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
443
0
{
444
0
  for (; block; block = block->next)
445
0
  {
446
0
    if (block->type == FZ_STEXT_BLOCK_IMAGE)
447
0
      fz_print_stext_image_as_html(ctx, out, block);
448
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
449
0
      fz_print_stext_block_as_html(ctx, out, block);
450
0
    else if (block->type == FZ_STEXT_BLOCK_STRUCT)
451
0
      fz_print_stext_struct_as_html(ctx, out, block);
452
0
  }
453
0
}
454
455
void
456
fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
457
0
{
458
0
  float w = page->mediabox.x1 - page->mediabox.x0;
459
0
  float h = page->mediabox.y1 - page->mediabox.y0;
460
461
0
  fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h);
462
463
0
  print_blocks_as_html(ctx, out, page->first_block);
464
465
0
  fz_write_string(ctx, out, "</div>\n");
466
0
}
467
468
void
469
fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
470
0
{
471
0
  fz_write_string(ctx, out, "<!DOCTYPE html>\n");
472
0
  fz_write_string(ctx, out, "<html>\n");
473
0
  fz_write_string(ctx, out, "<head>\n");
474
0
  fz_write_string(ctx, out, "<style>\n");
475
0
  fz_write_string(ctx, out, "body{background-color:slategray}\n");
476
0
  fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n");
477
0
  fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n");
478
0
  fz_write_string(ctx, out, "</style>\n");
479
0
  fz_write_string(ctx, out, "</head>\n");
480
0
  fz_write_string(ctx, out, "<body>\n");
481
0
}
482
483
void
484
fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
485
0
{
486
0
  fz_write_string(ctx, out, "</body>\n");
487
0
  fz_write_string(ctx, out, "</html>\n");
488
0
}
489
490
/* XHTML output (semantic, little layout, suitable for reflow) */
491
492
static void
493
find_table_pos(fz_stext_grid_positions *xs, float x0, float x1, int *ix0, int *ix1)
494
0
{
495
0
  int i;
496
497
0
  *ix0 = -1;
498
0
  *ix1 = -1;
499
500
0
  for (i = 1; i < xs->len; i++)
501
0
    if (x0 < xs->list[i].pos)
502
0
    {
503
0
      *ix0 = i-1;
504
0
      break;
505
0
    }
506
0
  for (; i < xs->len; i++)
507
0
    if (x1 < xs->list[i].pos)
508
0
    {
509
0
      *ix1 = i-1;
510
0
      break;
511
0
    }
512
0
  if (i == xs->len)
513
0
    *ix1 = i-1;
514
0
}
515
516
static void
517
run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out);
518
519
static unsigned int
520
grid_flags(fz_stext_grid_info *info, int x, int y)
521
0
{
522
0
  if (info == NULL || x < 0 || y < 0 || x >= info->w || y >= info->h)
523
0
    return 0;
524
0
  return info->info[y * info->w + x].flags;
525
0
}
526
527
static void
528
start_cell(fz_context *ctx, fz_output *out, fz_stext_grid_info *info, int x, int y)
529
0
{
530
0
  unsigned int flags = grid_flags(info, x, y);
531
0
  unsigned int flagsr = grid_flags(info, x+1, y);
532
0
  unsigned int flagsb = grid_flags(info, x, y+1);
533
0
  fz_write_string(ctx, out, "<td");
534
0
  if (info == NULL)
535
0
    return;
536
0
  if ((flags & FZ_STEXT_GRID_L_BORDER) == 0 &&
537
0
    (flags & FZ_STEXT_GRID_T_BORDER) == 0 &&
538
0
    (flagsr & FZ_STEXT_GRID_L_BORDER) == 0 &&
539
0
    (flagsb & FZ_STEXT_GRID_T_BORDER) == 0)
540
0
    return;
541
0
  fz_write_string(ctx, out, " style=\"border-style:");
542
0
  if (flags & FZ_STEXT_GRID_T_BORDER)
543
0
    fz_write_string(ctx, out, "solid ");
544
0
  else
545
0
    fz_write_string(ctx, out, "none ");
546
0
  if (flagsr & FZ_STEXT_GRID_L_BORDER)
547
0
    fz_write_string(ctx, out, "solid ");
548
0
  else
549
0
    fz_write_string(ctx, out, "none ");
550
0
  if (flagsb & FZ_STEXT_GRID_T_BORDER)
551
0
    fz_write_string(ctx, out, "solid ");
552
0
  else
553
0
    fz_write_string(ctx, out, "none ");
554
0
  if (flags & FZ_STEXT_GRID_L_BORDER)
555
0
    fz_write_string(ctx, out, "solid;\"");
556
0
  else
557
0
    fz_write_string(ctx, out, "none;\"");
558
559
0
}
560
561
static void
562
fz_print_stext_table_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
563
0
{
564
0
  fz_stext_block *grid, *tr, *td;
565
0
  int w, h;
566
0
  int x, y;
567
0
  uint8_t *cells;
568
0
  int malformed = 0;
569
570
0
  for (grid = block; grid != NULL; grid = grid->next)
571
0
    if (grid->type == FZ_STEXT_BLOCK_GRID)
572
0
      break;
573
0
  if (grid == NULL)
574
0
  {
575
0
    fz_warn(ctx, "Malformed table data");
576
0
    return;
577
0
  }
578
0
  w = grid->u.b.xs->len;
579
0
  h = grid->u.b.ys->len;
580
0
  cells = fz_calloc(ctx, w, h);
581
582
0
  fz_try(ctx)
583
0
  {
584
0
    fz_write_printf(ctx, out, "<table style=\"border-collapse: collapse;\">\n");
585
586
0
    y = 0;
587
0
    for (tr = grid->next; tr != NULL; tr = tr->next)
588
0
    {
589
0
      if (tr->type != FZ_STEXT_BLOCK_STRUCT || tr->u.s.down == NULL || tr->u.s.down->standard != FZ_STRUCTURE_TR)
590
0
      {
591
0
        malformed = 1;
592
0
        continue;
593
0
      }
594
0
      fz_write_printf(ctx, out, "<tr>\n");
595
0
      x = 0;
596
0
      for (td = tr->u.s.down->first_block; td != NULL; td = td->next)
597
0
      {
598
0
        int x0, y0, x1, y1;
599
0
        if (td->type != FZ_STEXT_BLOCK_STRUCT || td->u.s.down == NULL || td->u.s.down->standard != FZ_STRUCTURE_TD)
600
0
        {
601
0
          malformed = 1;
602
0
          continue;
603
0
        }
604
0
        find_table_pos(grid->u.b.xs, td->bbox.x0, td->bbox.x1, &x0, &x1);
605
0
        find_table_pos(grid->u.b.ys, td->bbox.y0, td->bbox.y1, &y0, &y1);
606
0
        if (x0 < 0 || x1 < 0 || x1 >= w)
607
0
        {
608
0
          malformed = 1;
609
0
          x0 = x;
610
0
          x1 = x+1;
611
0
        }
612
0
        if (y0 < 0 || y1 < 0 || y1 >= h)
613
0
        {
614
0
          malformed = 1;
615
0
          y0 = y;
616
0
          y1 = y+1;
617
0
        }
618
0
        if (y < y0)
619
0
        {
620
0
          malformed = 1;
621
0
          continue;
622
0
        }
623
0
        if (x > x0)
624
0
        {
625
0
          malformed = 1;
626
0
        }
627
0
        while (x < x0)
628
0
        {
629
0
          uint8_t *c = &cells[x + w*y];
630
0
          if (*c == 0)
631
0
          {
632
0
            start_cell(ctx, out, grid->u.b.info, x, y);
633
0
            fz_write_printf(ctx, out, "></td>");
634
0
            *c = 1;
635
0
          }
636
0
          x++;
637
0
        }
638
0
        start_cell(ctx, out, grid->u.b.info, x, y);
639
0
        if (x1 > x0+1)
640
0
          fz_write_printf(ctx, out, " colspan=\"%d\"", x1-x0);
641
0
        if (y1 > y0+1)
642
0
          fz_write_printf(ctx, out, " rowspan=\"%d\"", y1-y0);
643
0
        fz_write_string(ctx, out, ">");
644
0
        run_to_xhtml(ctx, td->u.s.down->first_block, out);
645
0
        fz_write_printf(ctx, out, "</td>\n");
646
0
        for ( ; y0 < y1; y0++)
647
0
          for (x = x0; x < x1; x++)
648
0
          {
649
0
            uint8_t *c = &cells[x + w*y0];
650
0
            if (*c != 0)
651
0
              malformed = 1;
652
0
            *c = 1;
653
0
          }
654
0
      }
655
0
      fz_write_printf(ctx, out, "</tr>\n");
656
0
      y++;
657
0
    }
658
659
0
    fz_write_printf(ctx, out, "</table>\n");
660
0
  }
661
0
  fz_always(ctx)
662
0
    fz_free(ctx, cells);
663
0
  fz_catch(ctx)
664
0
    fz_rethrow(ctx);
665
666
0
  if (malformed)
667
0
    fz_warn(ctx, "Malformed table data");
668
0
}
669
670
static void
671
fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
672
0
{
673
0
  int w = block->bbox.x1 - block->bbox.x0;
674
0
  int h = block->bbox.y1 - block->bbox.y0;
675
676
0
  fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
677
0
  fz_write_image_as_data_uri(ctx, out, block->u.i.image);
678
0
  fz_write_string(ctx, out, "\"/></p>\n");
679
0
}
680
681
static void
682
fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
683
0
{
684
0
  int is_mono = fz_font_is_monospaced(ctx, font);
685
0
  int is_bold = fz_font_is_bold(ctx, font);
686
0
  int is_italic = fz_font_is_italic(ctx, font);
687
688
0
  if (sup)
689
0
    fz_write_string(ctx, out, "<sup>");
690
0
  if (is_mono)
691
0
    fz_write_string(ctx, out, "<tt>");
692
0
  if (is_bold)
693
0
    fz_write_string(ctx, out, "<b>");
694
0
  if (is_italic)
695
0
    fz_write_string(ctx, out, "<i>");
696
0
}
697
698
static void
699
fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
700
0
{
701
0
  int is_mono = fz_font_is_monospaced(ctx, font);
702
0
  int is_bold = fz_font_is_bold(ctx, font);
703
0
  int is_italic = fz_font_is_italic(ctx, font);
704
705
0
  if (is_italic)
706
0
    fz_write_string(ctx, out, "</i>");
707
0
  if (is_bold)
708
0
    fz_write_string(ctx, out, "</b>");
709
0
  if (is_mono)
710
0
    fz_write_string(ctx, out, "</tt>");
711
0
  if (sup)
712
0
    fz_write_string(ctx, out, "</sup>");
713
0
}
714
715
/*
 * Return the mean 'size' of the characters in the list starting at 'ch',
 * or 0 if the list is empty.
 */
static float avg_font_size_of_line(fz_stext_char *ch)
{
  fz_stext_char *it;
  float total = 0;
  int count = 0;

  for (it = ch; it != NULL; it = it->next)
  {
    total += it->size;
    count++;
  }

  if (count == 0)
    return 0;
  return total / count;
}
729
730
/*
 * Pick an XHTML block tag from a font size: "h1" for sizes of at least 20,
 * "h2" for at least 15, "h3" for at least 12, and "p" otherwise.
 */
static const char *tag_from_font_size(float size)
{
  /* Thresholds in descending order; the first one reached wins. */
  static const struct { float min_size; const char *tag; } levels[] = {
    { 20, "h1" },
    { 15, "h2" },
    { 12, "h3" },
  };
  int i;

  for (i = 0; i < 3; i++)
  {
    if (size >= levels[i].min_size)
      return levels[i].tag;
  }
  return "p";
}
737
738
/*
 * Write a text block as semantic XHTML.
 *
 * Each line is assigned a block tag (h1/h2/h3/p) from its average font
 * size; consecutive lines with the same tag share one element. Within an
 * element, characters are wrapped in sup/tt/b/i tags that are reopened
 * whenever the font or superscript state changes. Lines are joined with a
 * single space unless the previous line already ended in one, and a
 * trailing hyphen is dropped on lines flagged as joined.
 *
 * A block with no lines produces no output at all.
 */
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
  fz_stext_line *line;
  fz_stext_char *ch;

  fz_font *font = NULL;
  int sup = 0;
  int sp = 1; /* non-zero when no separating space is needed before the next line */
  const char *tag = NULL;
  const char *new_tag;

  for (line = block->u.t.first_line; line; line = line->next)
  {
    new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
    /* Pointer comparison: tag_from_font_size returns one of a fixed set
     * of string literals. */
    if (tag != new_tag)
    {
      if (tag)
      {
        /* Close any open style tags before closing the element, so
         * that the markup stays properly nested. */
        if (font)
          fz_print_style_end_xhtml(ctx, out, font, sup);
        fz_write_printf(ctx, out, "</%s>", tag);
      }
      tag = new_tag;
      fz_write_printf(ctx, out, "<%s>", tag);
      if (font)
        fz_print_style_begin_xhtml(ctx, out, font, sup);
    }

    if (!sp)
      fz_write_byte(ctx, out, ' ');

    for (ch = line->first_char; ch; ch = ch->next)
    {
      int ch_sup = detect_super_script(line, ch);
      if (ch->font != font || ch_sup != sup)
      {
        if (font)
          fz_print_style_end_xhtml(ctx, out, font, sup);
        font = ch->font;
        sup = ch_sup;
        fz_print_style_begin_xhtml(ctx, out, font, sup);
      }

      sp = (ch->c == ' ');
      /* Skip hyphens on line joins */
      if (ch->next == NULL && (line->flags & FZ_STEXT_LINE_FLAGS_JOINED) != 0 && fz_is_unicode_hyphen(ch->c))
      {
        sp = 1;
        continue;
      }
      switch (ch->c)
      {
      default:
        if (ch->c >= 32 && ch->c <= 127)
          fz_write_byte(ctx, out, ch->c);
        else
          fz_write_printf(ctx, out, "&#x%x;", ch->c);
        break;
      case '<': fz_write_string(ctx, out, "&lt;"); break;
      case '>': fz_write_string(ctx, out, "&gt;"); break;
      case '&': fz_write_string(ctx, out, "&amp;"); break;
      case '"': fz_write_string(ctx, out, "&quot;"); break;
      case '\'': fz_write_string(ctx, out, "&apos;"); break;
      }
    }
  }

  if (font)
    fz_print_style_end_xhtml(ctx, out, font, sup);
  /* tag is only set once a line has been seen; with no lines there is no
   * element to close, and passing NULL for %s would emit a bogus tag. */
  if (tag)
    fz_write_printf(ctx, out, "</%s>\n", tag);
}
809
810
static void
811
fz_print_struct_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
812
0
{
813
0
  const char *tag;
814
815
0
  if (block->u.s.down == NULL)
816
0
    return;
817
818
0
  if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
819
0
  {
820
0
    fz_print_stext_table_as_xhtml(ctx, out, block->u.s.down->first_block);
821
0
    return;
822
0
  }
823
824
0
  tag = html_tag_for_struct(block->u.s.down);
825
826
0
  fz_write_printf(ctx, out, "<%s>\n", tag);
827
828
0
  run_to_xhtml(ctx, block->u.s.down->first_block, out);
829
830
0
  fz_write_printf(ctx, out, "</%s>\n", tag);
831
0
}
832
833
static void
834
run_to_xhtml(fz_context *ctx, fz_stext_block *block, fz_output *out)
835
0
{
836
0
  while (block)
837
0
  {
838
0
    switch(block->type)
839
0
    {
840
0
    case FZ_STEXT_BLOCK_IMAGE:
841
0
      fz_print_stext_image_as_xhtml(ctx, out, block);
842
0
      break;
843
0
    case FZ_STEXT_BLOCK_TEXT:
844
0
      fz_print_stext_block_as_xhtml(ctx, out, block);
845
0
      break;
846
0
    case FZ_STEXT_BLOCK_STRUCT:
847
0
      fz_print_struct_as_xhtml(ctx, out, block);
848
0
      break;
849
0
    }
850
0
    block = block->next;
851
0
  }
852
0
}
853
854
void
855
fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
856
0
{
857
0
  fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
858
859
0
  run_to_xhtml(ctx, page->first_block, out);
860
861
0
  fz_write_string(ctx, out, "</div>\n");
862
0
}
863
864
void
865
fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
866
0
{
867
0
  fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
868
0
  fz_write_string(ctx, out, "<!DOCTYPE html");
869
0
  fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
870
0
  fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
871
0
  fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
872
0
  fz_write_string(ctx, out, "<head>\n");
873
0
  fz_write_string(ctx, out, "<style>\n");
874
0
  fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
875
0
  fz_write_string(ctx, out, "</style>\n");
876
0
  fz_write_string(ctx, out, "</head>\n");
877
0
  fz_write_string(ctx, out, "<body>\n");
878
0
}
879
880
void
881
fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
882
0
{
883
0
  fz_write_string(ctx, out, "</body>\n");
884
0
  fz_write_string(ctx, out, "</html>\n");
885
0
}
886
887
/* Detailed XML dump of the entire structured text data */
888
889
static void
890
xml_write_char(fz_context *ctx, fz_output *out, int c)
891
0
{
892
0
  switch (c)
893
0
  {
894
0
  case '<': fz_write_string(ctx, out, "&lt;"); break;
895
0
  case '>': fz_write_string(ctx, out, "&gt;"); break;
896
0
  case '&': fz_write_string(ctx, out, "&amp;"); break;
897
0
  case '"': fz_write_string(ctx, out, "&quot;"); break;
898
0
  case '\'': fz_write_string(ctx, out, "&apos;"); break;
899
0
  default:
900
0
    if (c >= 32 && c <= 127)
901
0
      fz_write_printf(ctx, out, "%c", c);
902
0
    else
903
0
      fz_write_printf(ctx, out, "&#x%x;", c);
904
0
    break;
905
0
  }
906
0
}
907
908
static void
909
as_xml(fz_context *ctx, fz_stext_block *block, fz_output *out)
910
0
{
911
0
  fz_stext_line *line;
912
0
  fz_stext_char *ch;
913
0
  int i;
914
915
0
  while (block)
916
0
  {
917
0
    switch (block->type)
918
0
    {
919
0
    case FZ_STEXT_BLOCK_TEXT:
920
0
      fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\"",
921
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
922
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_UNKNOWN)
923
0
        fz_write_printf(ctx, out, " justify=\"unknown\"");
924
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_LEFT)
925
0
        fz_write_printf(ctx, out, " justify=\"left\"");
926
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_CENTRE)
927
0
        fz_write_printf(ctx, out, " justify=\"centre\"");
928
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_RIGHT)
929
0
        fz_write_printf(ctx, out, " justify=\"right\"");
930
0
      if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
931
0
        fz_write_printf(ctx, out, " justify=\"full\"");
932
0
      fz_write_printf(ctx, out, ">\n");
933
0
      for (line = block->u.t.first_line; line; line = line->next)
934
0
      {
935
0
        fz_font *font = NULL;
936
0
        float size = 0;
937
0
        const char *name = NULL;
938
939
0
        fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\" flags=\"%d\"",
940
0
            line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
941
0
            line->wmode,
942
0
            line->dir.x, line->dir.y, line->flags);
943
944
        /* This is duplication of information, but it makes it MUCH easier to search for
945
         * text fragments in large output. */
946
0
        {
947
0
          int valid = 1;
948
0
          fz_write_printf(ctx, out, " text=\"");
949
0
          for (ch = line->first_char; ch; ch = ch->next)
950
0
          {
951
0
            if (valid)
952
0
              valid = fz_is_valid_xml_char(ch->c);
953
0
            xml_write_char(ctx, out, fz_range_limit_xml_char(ch->c));
954
0
          }
955
0
          if (!valid)
956
0
          {
957
0
            fz_write_printf(ctx, out, "\" hextext=\"");
958
0
            for (ch = line->first_char; ch; ch = ch->next)
959
0
            {
960
0
              char text[8];
961
0
              int n = fz_runetochar(text, ch->c);
962
0
              for (i = 0; i < n; i++)
963
0
                fz_write_printf(ctx, out, "%02x", text[i]);
964
0
            }
965
0
          }
966
0
          fz_write_printf(ctx, out, "\"");
967
0
        }
968
969
0
        fz_write_printf(ctx, out, ">\n");
970
971
0
        for (ch = line->first_char; ch; ch = ch->next)
972
0
        {
973
0
          if (ch->font != font || ch->size != size)
974
0
          {
975
0
            const char *s;
976
0
            if (font)
977
0
              fz_write_string(ctx, out, "</font>\n");
978
0
            font = ch->font;
979
0
            size = ch->size;
980
0
            s = name = font_full_name(ctx, font);
981
0
            while (*s)
982
0
            {
983
0
              int c = *s++;
984
0
              if (c < 32 || c >= 127)
985
0
                break;
986
0
            }
987
0
            if (*s)
988
0
              fz_write_printf(ctx, out, "<font hexname=%>", name);
989
0
            else
990
0
              fz_write_printf(ctx, out, "<font name=\"%s\"", name);
991
0
            fz_write_printf(ctx, out, " size=\"%g\">\n", size);
992
0
          }
993
0
          fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" alpha=\"#%02x\" flags=\"%d\" c=\"",
994
0
              ch->quad.ul.x, ch->quad.ul.y,
995
0
              ch->quad.ur.x, ch->quad.ur.y,
996
0
              ch->quad.ll.x, ch->quad.ll.y,
997
0
              ch->quad.lr.x, ch->quad.lr.y,
998
0
              ch->origin.x, ch->origin.y,
999
0
              ch->bidi,
1000
0
              ch->argb & 0xFFFFFF,
1001
0
              ch->argb>>24,
1002
0
              ch->flags);
1003
0
          xml_write_char(ctx, out, ch->c);
1004
0
          if (!fz_is_valid_xml_char(ch->c))
1005
0
          {
1006
0
            char text[8];
1007
0
            int n = fz_runetochar(text, ch->c);
1008
0
            fz_write_string(ctx, out, "\" hexc=\"");
1009
0
            for (i = 0; i < n; i++)
1010
0
              fz_write_printf(ctx, out, "%02x", text[i]);
1011
0
          }
1012
0
          fz_write_string(ctx, out, "\"/>\n");
1013
0
        }
1014
1015
0
        if (font)
1016
0
          fz_write_string(ctx, out, "</font>\n");
1017
1018
0
        fz_write_string(ctx, out, "</line>\n");
1019
0
      }
1020
0
      fz_write_string(ctx, out, "</block>\n");
1021
0
      break;
1022
1023
0
    case FZ_STEXT_BLOCK_IMAGE:
1024
0
      fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
1025
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
1026
0
      break;
1027
1028
0
    case FZ_STEXT_BLOCK_STRUCT:
1029
0
      fz_write_printf(ctx, out, "<struct idx=\"%d\" bbox=\"%g %g %g %g\"", block->u.s.index,
1030
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
1031
0
      if (block->u.s.down)
1032
0
        fz_write_printf(ctx, out, " raw=\"%s\" std=\"%s\"",
1033
0
            block->u.s.down->raw, fz_structure_to_string(block->u.s.down->standard));
1034
0
      fz_write_printf(ctx, out, ">\n");
1035
0
      if (block->u.s.down)
1036
0
        as_xml(ctx, block->u.s.down->first_block, out);
1037
0
      fz_write_printf(ctx, out, "</struct>\n");
1038
0
      break;
1039
1040
0
    case FZ_STEXT_BLOCK_VECTOR:
1041
0
      fz_write_printf(ctx, out, "<vector bbox=\"%g %g %g %g\" stroke=\"%d\" rectangle=\"%d\" continues=\"%d\" argb=\"%08x\"/>\n",
1042
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1,
1043
0
          !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED),
1044
0
          !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE),
1045
0
          !!(block->u.v.flags & FZ_STEXT_VECTOR_CONTINUES),
1046
0
          block->u.v.argb);
1047
0
      break;
1048
1049
0
    case FZ_STEXT_BLOCK_GRID:
1050
0
      fz_write_printf(ctx, out, "<grid xpos=\"");
1051
0
      for (i = 0; i < block->u.b.xs->len; i++)
1052
0
        fz_write_printf(ctx, out, "%g ", block->u.b.xs->list[i].pos);
1053
0
      fz_write_printf(ctx, out, "\" xuncertainty=\"");
1054
0
      for (i = 0; i < block->u.b.xs->len; i++)
1055
0
        fz_write_printf(ctx, out, "%d ", block->u.b.xs->list[i].uncertainty);
1056
0
      fz_write_printf(ctx, out, "\" xmaxuncertainty=\"%d\" ypos=\"", block->u.b.xs->max_uncertainty);
1057
0
      for (i = 0; i < block->u.b.ys->len; i++)
1058
0
        fz_write_printf(ctx, out, "%g ", block->u.b.ys->list[i].pos);
1059
0
      fz_write_printf(ctx, out, "\" yuncertainty=\"");
1060
0
      for (i = 0; i < block->u.b.ys->len; i++)
1061
0
        fz_write_printf(ctx, out, "%d ", block->u.b.ys->list[i].uncertainty);
1062
0
      fz_write_printf(ctx, out, "\" ymaxuncertainty=\"%d\" />\n", block->u.b.ys->max_uncertainty);
1063
0
      break;
1064
0
    }
1065
0
    block = block->next;
1066
0
  }
1067
0
}
1068
1069
void
1070
fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
1071
0
{
1072
0
  fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
1073
0
    page->mediabox.x1 - page->mediabox.x0,
1074
0
    page->mediabox.y1 - page->mediabox.y0);
1075
1076
0
  as_xml(ctx, page->first_block, out);
1077
1078
0
  fz_write_string(ctx, out, "</page>\n");
1079
0
}
1080
1081
/* JSON dump */
1082
1083
static void
1084
as_json(fz_context *ctx, fz_stext_block *block, fz_output *out, float scale)
1085
0
{
1086
0
  fz_stext_line *line;
1087
0
  fz_stext_char *ch;
1088
0
  int comma = 0;
1089
1090
0
  while (block)
1091
0
  {
1092
0
    if (comma)
1093
0
      fz_write_string(ctx, out, ",");
1094
0
    comma = 1;
1095
1096
0
    switch (block->type)
1097
0
    {
1098
0
    case FZ_STEXT_BLOCK_TEXT:
1099
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
1100
0
      fz_write_printf(ctx, out, "%q:{", "bbox");
1101
0
      fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
1102
0
      fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
1103
0
      fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
1104
0
      fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
1105
0
      fz_write_printf(ctx, out, "%q:[", "lines");
1106
1107
0
      for (line = block->u.t.first_line; line; line = line->next)
1108
0
      {
1109
0
        if (line != block->u.t.first_line)
1110
0
          fz_write_string(ctx, out, ",");
1111
0
        fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
1112
0
        fz_write_printf(ctx, out, "%q:{", "bbox");
1113
0
        fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
1114
0
        fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
1115
0
        fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
1116
0
        fz_write_printf(ctx, out, "%q:%d,", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
1117
0
        fz_write_printf(ctx, out, "%q:%d},", "flags", line->flags);
1118
1119
        /* Since we force preserve-spans, the first char has the style for the entire line. */
1120
0
        if (line->first_char)
1121
0
        {
1122
0
          fz_font *font = line->first_char->font;
1123
0
          char *font_family = "sans-serif";
1124
0
          char *font_weight = "normal";
1125
0
          char *font_style = "normal";
1126
0
          if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
1127
0
          else if (fz_font_is_serif(ctx, font)) font_family = "serif";
1128
0
          if (fz_font_is_bold(ctx, font)) font_weight = "bold";
1129
0
          if (fz_font_is_italic(ctx, font)) font_style = "italic";
1130
0
          fz_write_printf(ctx, out, "%q:{", "font");
1131
0
          fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
1132
0
          fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
1133
0
          fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
1134
0
          fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
1135
0
          fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
1136
0
          fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
1137
0
          fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
1138
0
        }
1139
1140
0
        fz_write_printf(ctx, out, "%q:\"", "text");
1141
0
        for (ch = line->first_char; ch; ch = ch->next)
1142
0
        {
1143
0
          if (ch->c == '"' || ch->c == '\\')
1144
0
            fz_write_printf(ctx, out, "\\%c", ch->c);
1145
0
          else if (ch->c < 32)
1146
0
            fz_write_printf(ctx, out, "\\u%04x", ch->c);
1147
0
          else
1148
0
            fz_write_printf(ctx, out, "%C", ch->c);
1149
0
        }
1150
0
        fz_write_printf(ctx, out, "\"}");
1151
0
      }
1152
0
      fz_write_string(ctx, out, "]}");
1153
0
      break;
1154
1155
0
    case FZ_STEXT_BLOCK_IMAGE:
1156
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
1157
0
      fz_write_printf(ctx, out, "%q:{", "bbox");
1158
0
      fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
1159
0
      fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
1160
0
      fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
1161
0
      fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
1162
0
      break;
1163
1164
0
    case FZ_STEXT_BLOCK_STRUCT:
1165
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "structure");
1166
0
      fz_write_printf(ctx, out, "%q:%d", "index", block->u.s.index);
1167
0
      if (block->u.s.down)
1168
0
      {
1169
0
        fz_write_printf(ctx, out, ",%q:%q", "raw", block->u.s.down->raw);
1170
0
        fz_write_printf(ctx, out, ",%q:%q", "std", fz_structure_to_string(block->u.s.down->standard));
1171
0
        fz_write_printf(ctx, out, ",%q:[", "contents");
1172
0
        as_json(ctx, block->u.s.down->first_block, out, scale);
1173
0
        fz_write_printf(ctx, out, "]");
1174
0
      }
1175
0
      fz_write_printf(ctx, out, "}");
1176
0
      break;
1177
1178
0
    }
1179
0
    block = block->next;
1180
0
  }
1181
0
}
1182
1183
void
1184
fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
1185
0
{
1186
0
  fz_write_printf(ctx, out, "{%q:[", "blocks");
1187
1188
0
  as_json(ctx, page->first_block, out, scale);
1189
1190
0
  fz_write_string(ctx, out, "]}");
1191
0
}
1192
1193
/* Plain text */
1194
1195
static void
1196
do_as_text(fz_context *ctx, fz_output *out, fz_stext_block *first_block)
1197
0
{
1198
0
  fz_stext_block *block;
1199
0
  fz_stext_line *line;
1200
0
  fz_stext_char *ch;
1201
0
  char utf[10];
1202
0
  int i, n;
1203
1204
0
  for (block = first_block; block; block = block->next)
1205
0
  {
1206
0
    switch (block->type)
1207
0
    {
1208
0
    case FZ_STEXT_BLOCK_TEXT:
1209
0
      for (line = block->u.t.first_line; line; line = line->next)
1210
0
      {
1211
0
        int break_line = 1;
1212
0
        for (ch = line->first_char; ch; ch = ch->next)
1213
0
        {
1214
0
          if (ch->next == NULL && (line->flags & FZ_STEXT_LINE_FLAGS_JOINED) != 0)
1215
0
          {
1216
0
            break_line = 0;
1217
0
            continue;
1218
0
          }
1219
0
          n = fz_runetochar(utf, ch->c);
1220
0
          for (i = 0; i < n; i++)
1221
0
            fz_write_byte(ctx, out, utf[i]);
1222
0
        }
1223
0
        if (break_line)
1224
0
          fz_write_string(ctx, out, "\n");
1225
0
      }
1226
0
      fz_write_string(ctx, out, "\n");
1227
0
      break;
1228
0
    case FZ_STEXT_BLOCK_STRUCT:
1229
0
      if (block->u.s.down != NULL)
1230
0
        do_as_text(ctx, out, block->u.s.down->first_block);
1231
0
      break;
1232
0
    }
1233
0
  }
1234
0
}
1235
1236
void
1237
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
1238
0
{
1239
0
  do_as_text(ctx, out, page->first_block);
1240
0
}
1241
1242
/* Text output writer */
1243
1244
/* Output formats the text document writer can produce. */
enum {
  FZ_FORMAT_TEXT = 0,       /* plain text */
  FZ_FORMAT_HTML = 1,       /* HTML */
  FZ_FORMAT_XHTML = 2,      /* XHTML */
  FZ_FORMAT_STEXT_XML = 3,  /* structured text dumped as XML */
  FZ_FORMAT_STEXT_JSON = 4, /* structured text dumped as JSON */
};
1251
1252
typedef struct
1253
{
1254
  fz_document_writer super;
1255
  int format;
1256
  int number;
1257
  fz_stext_options opts;
1258
  fz_stext_page *page;
1259
  fz_output *out;
1260
} fz_text_writer;
1261
1262
static fz_device *
1263
text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
1264
0
{
1265
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1266
0
  float s = wri->opts.scale;
1267
1268
0
  if (wri->page)
1269
0
  {
1270
0
    fz_drop_stext_page(ctx, wri->page);
1271
0
    wri->page = NULL;
1272
0
  }
1273
1274
0
  wri->number++;
1275
1276
0
  wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s)));
1277
0
  return fz_new_stext_device(ctx, wri->page, &wri->opts);
1278
0
}
1279
1280
static void
1281
text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
1282
0
{
1283
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1284
0
  float s = wri->opts.scale;
1285
1286
0
  fz_scale_stext_page(ctx, wri->page, s);
1287
1288
0
  fz_try(ctx)
1289
0
  {
1290
0
    fz_close_device(ctx, dev);
1291
0
    switch (wri->format)
1292
0
    {
1293
0
    default:
1294
0
    case FZ_FORMAT_TEXT:
1295
0
      fz_print_stext_page_as_text(ctx, wri->out, wri->page);
1296
0
      break;
1297
0
    case FZ_FORMAT_HTML:
1298
0
      fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
1299
0
      break;
1300
0
    case FZ_FORMAT_XHTML:
1301
0
      fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
1302
0
      break;
1303
0
    case FZ_FORMAT_STEXT_XML:
1304
0
      fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
1305
0
      break;
1306
0
    case FZ_FORMAT_STEXT_JSON:
1307
0
      if (wri->number > 1)
1308
0
        fz_write_string(ctx, wri->out, ",");
1309
0
      fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
1310
0
      break;
1311
0
    }
1312
0
  }
1313
0
  fz_always(ctx)
1314
0
  {
1315
0
    fz_drop_device(ctx, dev);
1316
0
    fz_drop_stext_page(ctx, wri->page);
1317
0
    wri->page = NULL;
1318
0
  }
1319
0
  fz_catch(ctx)
1320
0
    fz_rethrow(ctx);
1321
0
}
1322
1323
static void
1324
text_close_writer(fz_context *ctx, fz_document_writer *wri_)
1325
0
{
1326
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1327
0
  switch (wri->format)
1328
0
  {
1329
0
  case FZ_FORMAT_HTML:
1330
0
    fz_print_stext_trailer_as_html(ctx, wri->out);
1331
0
    break;
1332
0
  case FZ_FORMAT_XHTML:
1333
0
    fz_print_stext_trailer_as_xhtml(ctx, wri->out);
1334
0
    break;
1335
0
  case FZ_FORMAT_STEXT_XML:
1336
0
    fz_write_string(ctx, wri->out, "</document>\n");
1337
0
    break;
1338
0
  case FZ_FORMAT_STEXT_JSON:
1339
0
    fz_write_string(ctx, wri->out, "]\n");
1340
0
    break;
1341
0
  }
1342
0
  fz_close_output(ctx, wri->out);
1343
0
}
1344
1345
static void
1346
text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
1347
0
{
1348
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
1349
0
  fz_drop_stext_page(ctx, wri->page);
1350
0
  fz_drop_output(ctx, wri->out);
1351
0
}
1352
1353
fz_document_writer *
1354
fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
1355
0
{
1356
0
  fz_text_writer *wri = NULL;
1357
1358
0
  fz_var(wri);
1359
1360
0
  fz_try(ctx)
1361
0
  {
1362
0
    wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
1363
0
    fz_parse_stext_options(ctx, &wri->opts, options);
1364
1365
0
    wri->format = FZ_FORMAT_TEXT;
1366
0
    if (!strcmp(format, "text"))
1367
0
      wri->format = FZ_FORMAT_TEXT;
1368
0
    else if (!strcmp(format, "html"))
1369
0
      wri->format = FZ_FORMAT_HTML;
1370
0
    else if (!strcmp(format, "xhtml"))
1371
0
      wri->format = FZ_FORMAT_XHTML;
1372
0
    else if (!strcmp(format, "stext"))
1373
0
      wri->format = FZ_FORMAT_STEXT_XML;
1374
0
    else if (!strcmp(format, "stext.xml"))
1375
0
      wri->format = FZ_FORMAT_STEXT_XML;
1376
0
    else if (!strcmp(format, "stext.json"))
1377
0
    {
1378
0
      wri->format = FZ_FORMAT_STEXT_JSON;
1379
0
      wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
1380
0
    }
1381
1382
0
    wri->out = out;
1383
1384
0
    switch (wri->format)
1385
0
    {
1386
0
    case FZ_FORMAT_HTML:
1387
0
      fz_print_stext_header_as_html(ctx, wri->out);
1388
0
      break;
1389
0
    case FZ_FORMAT_XHTML:
1390
0
      fz_print_stext_header_as_xhtml(ctx, wri->out);
1391
0
      break;
1392
0
    case FZ_FORMAT_STEXT_XML:
1393
0
      fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
1394
0
      fz_write_string(ctx, wri->out, "<document>\n");
1395
0
      break;
1396
0
    case FZ_FORMAT_STEXT_JSON:
1397
0
      fz_write_string(ctx, wri->out, "[");
1398
0
      break;
1399
0
    }
1400
0
  }
1401
0
  fz_catch(ctx)
1402
0
  {
1403
0
    fz_drop_output(ctx, out);
1404
0
    fz_free(ctx, wri);
1405
0
    fz_rethrow(ctx);
1406
0
  }
1407
1408
0
  return (fz_document_writer*)wri;
1409
0
}
1410
1411
fz_document_writer *
1412
fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
1413
0
{
1414
0
  fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
1415
0
  return fz_new_text_writer_with_output(ctx, format, out, options);
1416
0
}