Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/fitz/stext-output.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2021 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#define SUBSCRIPT_OFFSET 0.2f
26
#define SUPERSCRIPT_OFFSET -0.2f
27
28
#include <ft2build.h>
29
#include FT_FREETYPE_H
30
31
// Text black color when converted from DeviceCMYK to RGB
32
0
#define CMYK_BLACK 0x221f1f
33
34
/*
 * Scale all geometry stored in a structured text page by a uniform factor,
 * in place.
 *
 * Block and line bounding boxes, character origins and quads are mapped
 * through a scale matrix, character sizes are multiplied by 'scale', and
 * image blocks get the scale appended to their transform.
 */
static void fz_scale_stext_page(fz_context *ctx, fz_stext_page *page, float scale)
{
  fz_matrix m = fz_scale(scale, scale);
  fz_stext_block *block;
  fz_stext_line *line;
  fz_stext_char *ch;

  for (block = page->first_block; block; block = block->next)
  {
    block->bbox = fz_transform_rect(block->bbox, m);
    switch (block->type)
    {
    case FZ_STEXT_BLOCK_TEXT:
      for (line = block->u.t.first_line; line; line = line->next)
      {
        /* Scale the line's own box. The block box was already scaled
         * above, so reusing it here would both apply the scale twice
         * and give every line the extent of the whole block. */
        line->bbox = fz_transform_rect(line->bbox, m);
        for (ch = line->first_char; ch; ch = ch->next)
        {
          ch->origin = fz_transform_point(ch->origin, m);
          ch->quad = fz_transform_quad(ch->quad, m);
          ch->size = ch->size * scale;
        }
      }
      break;

    case FZ_STEXT_BLOCK_IMAGE:
      block->u.i.transform = fz_post_scale(block->u.i.transform, scale, scale);
      break;
    }
  }
}
65
66
/* HTML output (visual formatting with preserved layout) */
67
68
static int
69
detect_super_script(fz_stext_line *line, fz_stext_char *ch)
70
0
{
71
0
  if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
72
0
    return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
73
0
  return 0;
74
0
}
75
76
static const char *
77
font_full_name(fz_context *ctx, fz_font *font)
78
0
{
79
0
  const char *name = fz_font_name(ctx, font);
80
0
  const char *s = strchr(name, '+');
81
0
  return s ? s + 1 : name;
82
0
}
83
84
/*
 * Map a font name onto a common HTML font family by substring match.
 *
 * "Times" maps to "Times New Roman"; "Arial" or "Helvetica" map to
 * "Arial" (or "Arial Narrow" if the name also contains "Narrow" or
 * "Condensed"); "Courier" maps to "Courier". Any other name is returned
 * as passed in.
 */
static const char *
html_clean_font_name(const char *fontname)
{
  if (strstr(fontname, "Times") != NULL)
    return "Times New Roman";

  if (strstr(fontname, "Arial") != NULL || strstr(fontname, "Helvetica") != NULL)
  {
    int narrow = strstr(fontname, "Narrow") != NULL || strstr(fontname, "Condensed") != NULL;
    return narrow ? "Arial Narrow" : "Arial";
  }

  return strstr(fontname, "Courier") != NULL ? "Courier" : fontname;
}
99
100
static void
101
font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
102
0
{
103
0
  const char *name = html_clean_font_name(font_full_name(ctx, font));
104
0
  char *s;
105
0
  fz_strlcpy(buf, name, size);
106
0
  s = strrchr(buf, '-');
107
0
  if (s)
108
0
    *s = 0;
109
0
  if (is_mono)
110
0
    fz_strlcat(buf, ",monospace", size);
111
0
  else
112
0
    fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
113
0
}
114
115
static void
116
fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
117
0
{
118
0
  char family[80];
119
120
0
  int is_bold = fz_font_is_bold(ctx, font);
121
0
  int is_italic = fz_font_is_italic(ctx, font);
122
0
  int is_serif = fz_font_is_serif(ctx, font);
123
0
  int is_mono = fz_font_is_monospaced(ctx, font);
124
125
0
  font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
126
127
0
  if (sup) fz_write_string(ctx, out, "<sup>");
128
0
  if (is_mono) fz_write_string(ctx, out, "<tt>");
129
0
  if (is_bold) fz_write_string(ctx, out, "<b>");
130
0
  if (is_italic) fz_write_string(ctx, out, "<i>");
131
0
  fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%.1fpt", family, size);
132
0
  if (color != 0 && color != CMYK_BLACK)
133
0
    fz_write_printf(ctx, out, ";color:#%06x", color);
134
0
  fz_write_printf(ctx, out, "\">");
135
0
}
136
137
static void
138
fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
139
0
{
140
0
  int is_mono = fz_font_is_monospaced(ctx, font);
141
0
  int is_bold = fz_font_is_bold(ctx,font);
142
0
  int is_italic = fz_font_is_italic(ctx, font);
143
144
0
  fz_write_string(ctx, out, "</span>");
145
0
  if (is_italic) fz_write_string(ctx, out, "</i>");
146
0
  if (is_bold) fz_write_string(ctx, out, "</b>");
147
0
  if (is_mono) fz_write_string(ctx, out, "</tt>");
148
0
  if (sup) fz_write_string(ctx, out, "</sup>");
149
0
}
150
151
static void
152
fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
153
0
{
154
0
  fz_matrix ctm = block->u.i.transform;
155
156
0
#define USE_CSS_MATRIX_TRANSFORMS
157
0
#ifdef USE_CSS_MATRIX_TRANSFORMS
158
  /* Matrix maths notes.
159
   * When we get here ctm maps the unit square to the position in device
160
   * space occupied by the image.
161
   *
162
   * That is to say that mapping the 4 corners of the unit square through
163
   * the transform, give us the 4 target corners. We extend the corners
164
   * by adding an extra '1' into them to allow transforms to work. Thus
165
   * (x,y) maps through ctm = (a b c d e f) as:
166
   *
167
   * (x y 1) (a b 0) = (X Y 1)
168
   *         (c d 0)
169
   *         (e f 1)
170
   *
171
   * To simplify reading of matrix maths, we use the trick where we
172
   * 'drop' the first matrix down the page. Thus the corners c0=(0,0),
173
   * c1=(1,0), c2=(1,1), c3=(0,1) map to C0, C1, C2, C3 respectively:
174
   *
175
   *         (    a     b 0)
176
   *         (    c     d 0)
177
   *         (    e     f 1)
178
   * (0 0 1) (    e     f 1)
179
   * (0 1 1) (  c+e   d+f 1)
180
   * (1 1 1) (a+c+e b+d+f 1)
181
   * (1 0 1) (  a+e   b+f 1)
182
   *
183
   * where C0 = (e,f), C1=(c+e, d+f) C2=(a+c+e, b+d+f), C3=(a+e, b+f)
184
   *
185
   * Unfortunately, the CSS matrix transform, does not map the unit square.
186
   * Rather it does something moderately mad. As far as I can work out, the
187
   * top left corner of a (0,0) -> (w, h) box is transformed using the .e
188
   * and .f entries of the matrix. Then the image from within that square
189
   * is transformed using the centre of that square as the origin.
190
   *
191
   * So, an image placed at (0,0) in destination space with 1:1 transform
192
   * will result in an image a (0,0) as you'd expect. But an image at (0,0)
193
   * with a scale of 2, will result in 25% of the image off the left of the
194
   * screen, and 25% off the top.
195
   *
196
   * Accordingly, we have to adjust the ctm in several steps.
197
   */
198
  /* Move to moving the centre of the image. */
199
0
  ctm.e += (ctm.a+ctm.c)/2;
200
0
  ctm.f += (ctm.b+ctm.d)/2;
201
  /* Move from transforming the unit square to w/h */
202
0
  ctm.a /= block->u.i.image->w;
203
0
  ctm.b /= block->u.i.image->w;
204
0
  ctm.c /= block->u.i.image->h;
205
0
  ctm.d /= block->u.i.image->h;
206
  /* Move from points to pixels */
207
0
  ctm.a *= 96.0f/72;
208
0
  ctm.b *= 96.0f/72;
209
0
  ctm.c *= 96.0f/72;
210
0
  ctm.d *= 96.0f/72;
211
0
  ctm.e *= 96.0f/72;
212
0
  ctm.f *= 96.0f/72;
213
  /* Move to moving the top left of the untransformed image box, cos HTML is bonkers. */
214
0
  ctm.e -= block->u.i.image->w/2;
215
0
  ctm.f -= block->u.i.image->h/2;
216
217
0
  fz_write_printf(ctx, out, "<img style=\"position:absolute;transform:matrix(%g,%g,%g,%g,%g,%g)\" src=\"",
218
0
    ctm.a, ctm.b, ctm.c, ctm.d, ctm.e, ctm.f);
219
#else
220
  /* Alternative version of the code that uses scaleX/Y and rotate
221
   * instead, but only copes with axis aligned cases. */
222
  int t;
223
224
  int x = block->bbox.x0;
225
  int y = block->bbox.y0;
226
  int w = block->bbox.x1 - block->bbox.x0;
227
  int h = block->bbox.y1 - block->bbox.y0;
228
229
  const char *flip = "";
230
231
  if (ctm.b == 0 && ctm.c == 0)
232
  {
233
    if (ctm.a < 0 && ctm.d < 0)
234
      flip = "transform: scaleX(-1) scaleY(-1);";
235
    else if (ctm.a < 0)
236
    {
237
      flip = "transform: scaleX(-1);";
238
    }
239
    else if (ctm.d < 0)
240
    {
241
      flip = "transform: scaleY(-1);";
242
    }
243
  } else if (ctm.a == 0 && ctm.d == 0) {
244
    if (ctm.b < 0 && ctm.c < 0)
245
    {
246
      flip = "transform: scaleY(-1) rotate(90deg);";
247
      x += (w-h)/2;
248
      y -= (w-h)/2;
249
      t = w; w = h; h = t;
250
    }
251
    else if (ctm.b < 0)
252
    {
253
      flip = "transform: scaleX(-1) scaleY(-1) rotate(90deg);";
254
      x += (w-h)/2;
255
      y -= (w-h)/2;
256
      t = w; w = h; h = t;
257
    }
258
    else if (ctm.c < 0)
259
    {
260
      flip = "transform: scaleX(-1) scaleY(-1) rotate(270deg);";
261
      x += (w-h)/2;
262
      y -= (w-h)/2;
263
      t = w; w = h; h = t;
264
    }
265
    else
266
    {
267
      flip = "transform: scaleY(-1) rotate(270deg);";
268
      x += (w-h)/2;
269
      y -= (w-h)/2;
270
      t = w; w = h; h = t;
271
    }
272
  }
273
274
  fz_write_printf(ctx, out, "<img style=\"position:absolute;%stop:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", flip, y, x, w, h);
275
#endif
276
0
  fz_write_image_as_data_uri(ctx, out, block->u.i.image);
277
0
  fz_write_string(ctx, out, "\">\n");
278
0
}
279
280
void
281
fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
282
0
{
283
0
  fz_stext_line *line;
284
0
  fz_stext_char *ch;
285
0
  float x, y, h;
286
287
0
  fz_font *font = NULL;
288
0
  float size = 0;
289
0
  int sup = 0;
290
0
  int color = 0;
291
292
0
  for (line = block->u.t.first_line; line; line = line->next)
293
0
  {
294
0
    x = line->bbox.x0;
295
0
    y = line->bbox.y0;
296
0
    h = line->bbox.y1 - line->bbox.y0;
297
298
0
    if (line->first_char)
299
0
    {
300
0
      h = line->first_char->size;
301
0
      y = line->first_char->origin.y - h * 0.8f;
302
0
    }
303
304
0
    fz_write_printf(ctx, out, "<p style=\"top:%.1fpt;left:%.1fpt;line-height:%.1fpt\">", y, x, h);
305
0
    font = NULL;
306
307
0
    for (ch = line->first_char; ch; ch = ch->next)
308
0
    {
309
0
      int ch_sup = detect_super_script(line, ch);
310
0
      if (ch->font != font || ch->size != size || ch_sup != sup || ch->color != color)
311
0
      {
312
0
        if (font)
313
0
          fz_print_style_end_html(ctx, out, font, size, sup, color);
314
0
        font = ch->font;
315
0
        size = ch->size;
316
0
        color = ch->color;
317
0
        sup = ch_sup;
318
0
        fz_print_style_begin_html(ctx, out, font, size, sup, color);
319
0
      }
320
321
0
      switch (ch->c)
322
0
      {
323
0
      default:
324
0
        if (ch->c >= 32 && ch->c <= 127)
325
0
          fz_write_byte(ctx, out, ch->c);
326
0
        else
327
0
          fz_write_printf(ctx, out, "&#x%x;", ch->c);
328
0
        break;
329
0
      case '<': fz_write_string(ctx, out, "&lt;"); break;
330
0
      case '>': fz_write_string(ctx, out, "&gt;"); break;
331
0
      case '&': fz_write_string(ctx, out, "&amp;"); break;
332
0
      case '"': fz_write_string(ctx, out, "&quot;"); break;
333
0
      case '\'': fz_write_string(ctx, out, "&apos;"); break;
334
0
      }
335
0
    }
336
337
0
    if (font)
338
0
      fz_print_style_end_html(ctx, out, font, size, sup, color);
339
340
0
    fz_write_string(ctx, out, "</p>\n");
341
0
  }
342
0
}
343
344
void
345
fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
346
0
{
347
0
  fz_stext_block *block;
348
349
0
  float w = page->mediabox.x1 - page->mediabox.x0;
350
0
  float h = page->mediabox.y1 - page->mediabox.y0;
351
352
0
  fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"width:%.1fpt;height:%.1fpt\">\n", id, w, h);
353
354
0
  for (block = page->first_block; block; block = block->next)
355
0
  {
356
0
    if (block->type == FZ_STEXT_BLOCK_IMAGE)
357
0
      fz_print_stext_image_as_html(ctx, out, block);
358
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
359
0
      fz_print_stext_block_as_html(ctx, out, block);
360
0
  }
361
362
0
  fz_write_string(ctx, out, "</div>\n");
363
0
}
364
365
void
366
fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
367
0
{
368
0
  fz_write_string(ctx, out, "<!DOCTYPE html>\n");
369
0
  fz_write_string(ctx, out, "<html>\n");
370
0
  fz_write_string(ctx, out, "<head>\n");
371
0
  fz_write_string(ctx, out, "<style>\n");
372
0
  fz_write_string(ctx, out, "body{background-color:slategray}\n");
373
0
  fz_write_string(ctx, out, "div{position:relative;background-color:white;margin:1em auto;box-shadow:1px 1px 8px -2px black}\n");
374
0
  fz_write_string(ctx, out, "p{position:absolute;white-space:pre;margin:0}\n");
375
0
  fz_write_string(ctx, out, "</style>\n");
376
0
  fz_write_string(ctx, out, "</head>\n");
377
0
  fz_write_string(ctx, out, "<body>\n");
378
0
}
379
380
void
381
fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
382
0
{
383
0
  fz_write_string(ctx, out, "</body>\n");
384
0
  fz_write_string(ctx, out, "</html>\n");
385
0
}
386
387
/* XHTML output (semantic, little layout, suitable for reflow) */
388
389
static void
390
fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
391
0
{
392
0
  int w = block->bbox.x1 - block->bbox.x0;
393
0
  int h = block->bbox.y1 - block->bbox.y0;
394
395
0
  fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
396
0
  fz_write_image_as_data_uri(ctx, out, block->u.i.image);
397
0
  fz_write_string(ctx, out, "\"/></p>\n");
398
0
}
399
400
static void
401
fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
402
0
{
403
0
  int is_mono = fz_font_is_monospaced(ctx, font);
404
0
  int is_bold = fz_font_is_bold(ctx, font);
405
0
  int is_italic = fz_font_is_italic(ctx, font);
406
407
0
  if (sup)
408
0
    fz_write_string(ctx, out, "<sup>");
409
0
  if (is_mono)
410
0
    fz_write_string(ctx, out, "<tt>");
411
0
  if (is_bold)
412
0
    fz_write_string(ctx, out, "<b>");
413
0
  if (is_italic)
414
0
    fz_write_string(ctx, out, "<i>");
415
0
}
416
417
static void
418
fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
419
0
{
420
0
  int is_mono = fz_font_is_monospaced(ctx, font);
421
0
  int is_bold = fz_font_is_bold(ctx, font);
422
0
  int is_italic = fz_font_is_italic(ctx, font);
423
424
0
  if (is_italic)
425
0
    fz_write_string(ctx, out, "</i>");
426
0
  if (is_bold)
427
0
    fz_write_string(ctx, out, "</b>");
428
0
  if (is_mono)
429
0
    fz_write_string(ctx, out, "</tt>");
430
0
  if (sup)
431
0
    fz_write_string(ctx, out, "</sup>");
432
0
}
433
434
/*
 * Return the arithmetic mean of the sizes of all characters in the list
 * starting at 'ch', or 0 for an empty list.
 */
static float avg_font_size_of_line(fz_stext_char *ch)
{
  float total = 0;
  int count = 0;

  for (; ch; ch = ch->next)
  {
    total += ch->size;
    ++count;
  }

  if (count == 0)
    return 0;
  return total / count;
}
448
449
/*
 * Pick the XHTML element for a line from its font size:
 * 20 and above is "h1", 15 and above "h2", 12 and above "h3",
 * anything else "p".
 */
static const char *tag_from_font_size(float size)
{
  /* Thresholds in descending order; the first one reached wins. */
  static const struct { float min_size; const char *tag; } headings[] = {
    { 20, "h1" },
    { 15, "h2" },
    { 12, "h3" },
  };
  int i;

  for (i = 0; i < (int)(sizeof headings / sizeof headings[0]); i++)
  {
    if (size >= headings[i].min_size)
      return headings[i].tag;
  }
  return "p";
}
456
457
/*
 * Write a text block as flowing XHTML.
 *
 * Each line is assigned an element (h1/h2/h3/p) from its average font
 * size; consecutive lines with the same element share one tag. A space
 * is inserted between lines unless the previous character written was a
 * space. Inline style tags are reopened whenever the font or superscript
 * state changes.
 *
 * A block without any lines produces no output at all.
 */
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
  fz_stext_line *line;
  fz_stext_char *ch;

  fz_font *font = NULL;
  int sup = 0;
  int sp = 1;
  const char *tag = NULL;
  const char *new_tag;

  for (line = block->u.t.first_line; line; line = line->next)
  {
    new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
    if (tag != new_tag)
    {
      if (tag)
      {
        /* Inline tags must be closed before their enclosing element. */
        if (font)
          fz_print_style_end_xhtml(ctx, out, font, sup);
        fz_write_printf(ctx, out, "</%s>", tag);
      }
      tag = new_tag;
      fz_write_printf(ctx, out, "<%s>", tag);
      if (font)
        fz_print_style_begin_xhtml(ctx, out, font, sup);
    }

    /* Separate this line from the previous one with a single space. */
    if (!sp)
      fz_write_byte(ctx, out, ' ');

    for (ch = line->first_char; ch; ch = ch->next)
    {
      int ch_sup = detect_super_script(line, ch);
      if (ch->font != font || ch_sup != sup)
      {
        if (font)
          fz_print_style_end_xhtml(ctx, out, font, sup);
        font = ch->font;
        sup = ch_sup;
        fz_print_style_begin_xhtml(ctx, out, font, sup);
      }

      sp = (ch->c == ' ');
      switch (ch->c)
      {
      default:
        if (ch->c >= 32 && ch->c <= 127)
          fz_write_byte(ctx, out, ch->c);
        else
          fz_write_printf(ctx, out, "&#x%x;", ch->c);
        break;
      case '<': fz_write_string(ctx, out, "&lt;"); break;
      case '>': fz_write_string(ctx, out, "&gt;"); break;
      case '&': fz_write_string(ctx, out, "&amp;"); break;
      case '"': fz_write_string(ctx, out, "&quot;"); break;
      case '\'': fz_write_string(ctx, out, "&apos;"); break;
      }
    }
  }

  if (font)
    fz_print_style_end_xhtml(ctx, out, font, sup);
  /* 'tag' is still NULL when the block had no lines: no element was
   * opened, so there is nothing to close (and NULL must not reach %s). */
  if (tag)
    fz_write_printf(ctx, out, "</%s>\n", tag);
}
522
523
void
524
fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
525
0
{
526
0
  fz_stext_block *block;
527
528
0
  fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
529
530
0
  for (block = page->first_block; block; block = block->next)
531
0
  {
532
0
    if (block->type == FZ_STEXT_BLOCK_IMAGE)
533
0
      fz_print_stext_image_as_xhtml(ctx, out, block);
534
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
535
0
      fz_print_stext_block_as_xhtml(ctx, out, block);
536
0
  }
537
538
0
  fz_write_string(ctx, out, "</div>\n");
539
0
}
540
541
void
542
fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
543
0
{
544
0
  fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
545
0
  fz_write_string(ctx, out, "<!DOCTYPE html");
546
0
  fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
547
0
  fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
548
0
  fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
549
0
  fz_write_string(ctx, out, "<head>\n");
550
0
  fz_write_string(ctx, out, "<style>\n");
551
0
  fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
552
0
  fz_write_string(ctx, out, "</style>\n");
553
0
  fz_write_string(ctx, out, "</head>\n");
554
0
  fz_write_string(ctx, out, "<body>\n");
555
0
}
556
557
void
558
fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
559
0
{
560
0
  fz_write_string(ctx, out, "</body>\n");
561
0
  fz_write_string(ctx, out, "</html>\n");
562
0
}
563
564
/* Detailed XML dump of the entire structured text data */
565
566
void
567
fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
568
0
{
569
0
  fz_stext_block *block;
570
0
  fz_stext_line *line;
571
0
  fz_stext_char *ch;
572
573
0
  fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
574
0
    page->mediabox.x1 - page->mediabox.x0,
575
0
    page->mediabox.y1 - page->mediabox.y0);
576
577
0
  for (block = page->first_block; block; block = block->next)
578
0
  {
579
0
    switch (block->type)
580
0
    {
581
0
    case FZ_STEXT_BLOCK_TEXT:
582
0
      fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
583
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
584
0
      for (line = block->u.t.first_line; line; line = line->next)
585
0
      {
586
0
        fz_font *font = NULL;
587
0
        float size = 0;
588
0
        const char *name = NULL;
589
590
0
        fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\">\n",
591
0
            line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
592
0
            line->wmode,
593
0
            line->dir.x, line->dir.y);
594
595
0
        for (ch = line->first_char; ch; ch = ch->next)
596
0
        {
597
0
          if (ch->font != font || ch->size != size)
598
0
          {
599
0
            if (font)
600
0
              fz_write_string(ctx, out, "</font>\n");
601
0
            font = ch->font;
602
0
            size = ch->size;
603
0
            name = font_full_name(ctx, font);
604
0
            fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", name, size);
605
0
          }
606
0
          fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" bidi=\"%d\" color=\"#%06x\" c=\"",
607
0
              ch->quad.ul.x, ch->quad.ul.y,
608
0
              ch->quad.ur.x, ch->quad.ur.y,
609
0
              ch->quad.ll.x, ch->quad.ll.y,
610
0
              ch->quad.lr.x, ch->quad.lr.y,
611
0
              ch->origin.x, ch->origin.y,
612
0
              ch->bidi,
613
0
              ch->color);
614
0
          switch (ch->c)
615
0
          {
616
0
          case '<': fz_write_string(ctx, out, "&lt;"); break;
617
0
          case '>': fz_write_string(ctx, out, "&gt;"); break;
618
0
          case '&': fz_write_string(ctx, out, "&amp;"); break;
619
0
          case '"': fz_write_string(ctx, out, "&quot;"); break;
620
0
          case '\'': fz_write_string(ctx, out, "&apos;"); break;
621
0
          default:
622
0
               if (ch->c >= 32 && ch->c <= 127)
623
0
                 fz_write_printf(ctx, out, "%c", ch->c);
624
0
               else
625
0
                 fz_write_printf(ctx, out, "&#x%x;", ch->c);
626
0
               break;
627
0
          }
628
0
          fz_write_string(ctx, out, "\"/>\n");
629
0
        }
630
631
0
        if (font)
632
0
          fz_write_string(ctx, out, "</font>\n");
633
634
0
        fz_write_string(ctx, out, "</line>\n");
635
0
      }
636
0
      fz_write_string(ctx, out, "</block>\n");
637
0
      break;
638
639
0
    case FZ_STEXT_BLOCK_IMAGE:
640
0
      fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
641
0
          block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
642
0
      break;
643
0
    }
644
0
  }
645
0
  fz_write_string(ctx, out, "</page>\n");
646
0
}
647
648
/* JSON dump */
649
650
void
651
fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
652
0
{
653
0
  fz_stext_block *block;
654
0
  fz_stext_line *line;
655
0
  fz_stext_char *ch;
656
657
0
  fz_write_printf(ctx, out, "{%q:[", "blocks");
658
659
0
  for (block = page->first_block; block; block = block->next)
660
0
  {
661
0
    if (block != page->first_block)
662
0
      fz_write_string(ctx, out, ",");
663
0
    switch (block->type)
664
0
    {
665
0
    case FZ_STEXT_BLOCK_TEXT:
666
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
667
0
      fz_write_printf(ctx, out, "%q:{", "bbox");
668
0
      fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
669
0
      fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
670
0
      fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
671
0
      fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
672
0
      fz_write_printf(ctx, out, "%q:[", "lines");
673
674
0
      for (line = block->u.t.first_line; line; line = line->next)
675
0
      {
676
0
        if (line != block->u.t.first_line)
677
0
          fz_write_string(ctx, out, ",");
678
0
        fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
679
0
        fz_write_printf(ctx, out, "%q:{", "bbox");
680
0
        fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
681
0
        fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
682
0
        fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
683
0
        fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
684
685
        /* Since we force preserve-spans, the first char has the style for the entire line. */
686
0
        if (line->first_char)
687
0
        {
688
0
          fz_font *font = line->first_char->font;
689
0
          char *font_family = "sans-serif";
690
0
          char *font_weight = "normal";
691
0
          char *font_style = "normal";
692
0
          if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
693
0
          else if (fz_font_is_serif(ctx, font)) font_family = "serif";
694
0
          if (fz_font_is_bold(ctx, font)) font_weight = "bold";
695
0
          if (fz_font_is_italic(ctx, font)) font_style = "italic";
696
0
          fz_write_printf(ctx, out, "%q:{", "font");
697
0
          fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
698
0
          fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
699
0
          fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
700
0
          fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
701
0
          fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
702
0
          fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
703
0
          fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
704
0
        }
705
706
0
        fz_write_printf(ctx, out, "%q:\"", "text");
707
0
        for (ch = line->first_char; ch; ch = ch->next)
708
0
        {
709
0
          if (ch->c == '"' || ch->c == '\\')
710
0
            fz_write_printf(ctx, out, "\\%c", ch->c);
711
0
          else if (ch->c < 32)
712
0
            fz_write_printf(ctx, out, "\\u%04x", ch->c);
713
0
          else
714
0
            fz_write_printf(ctx, out, "%C", ch->c);
715
0
        }
716
0
        fz_write_printf(ctx, out, "\"}");
717
0
      }
718
0
      fz_write_string(ctx, out, "]}");
719
0
      break;
720
721
0
    case FZ_STEXT_BLOCK_IMAGE:
722
0
      fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
723
0
      fz_write_printf(ctx, out, "%q:{", "bbox");
724
0
      fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
725
0
      fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
726
0
      fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
727
0
      fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
728
0
      break;
729
0
    }
730
0
  }
731
0
  fz_write_string(ctx, out, "]}");
732
0
}
733
734
/* Plain text */
735
736
void
737
fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
738
0
{
739
0
  fz_stext_block *block;
740
0
  fz_stext_line *line;
741
0
  fz_stext_char *ch;
742
0
  char utf[10];
743
0
  int i, n;
744
745
0
  for (block = page->first_block; block; block = block->next)
746
0
  {
747
0
    if (block->type == FZ_STEXT_BLOCK_TEXT)
748
0
    {
749
0
      for (line = block->u.t.first_line; line; line = line->next)
750
0
      {
751
0
        for (ch = line->first_char; ch; ch = ch->next)
752
0
        {
753
0
          n = fz_runetochar(utf, ch->c);
754
0
          for (i = 0; i < n; i++)
755
0
            fz_write_byte(ctx, out, utf[i]);
756
0
        }
757
0
        fz_write_string(ctx, out, "\n");
758
0
      }
759
0
      fz_write_string(ctx, out, "\n");
760
0
    }
761
0
  }
762
0
}
763
764
/* Text output writer */
765
766
/* Output formats supported by the text document writer. */
enum {
  FZ_FORMAT_TEXT = 0,       /* plain text */
  FZ_FORMAT_HTML = 1,       /* HTML with preserved layout */
  FZ_FORMAT_XHTML = 2,      /* semantic XHTML */
  FZ_FORMAT_STEXT_XML = 3,  /* XML dump of the structured text */
  FZ_FORMAT_STEXT_JSON = 4, /* JSON dump of the structured text */
};
773
774
typedef struct
775
{
776
  fz_document_writer super;
777
  int format;
778
  int number;
779
  fz_stext_options opts;
780
  fz_stext_page *page;
781
  fz_output *out;
782
} fz_text_writer;
783
784
static fz_device *
785
text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
786
0
{
787
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
788
0
  float s = wri->opts.scale;
789
790
0
  if (wri->page)
791
0
  {
792
0
    fz_drop_stext_page(ctx, wri->page);
793
0
    wri->page = NULL;
794
0
  }
795
796
0
  wri->number++;
797
798
0
  wri->page = fz_new_stext_page(ctx, fz_transform_rect(mediabox, fz_scale(s, s)));
799
0
  return fz_new_stext_device(ctx, wri->page, &wri->opts);
800
0
}
801
802
static void
803
text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
804
0
{
805
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
806
0
  float s = wri->opts.scale;
807
808
0
  fz_scale_stext_page(ctx, wri->page, s);
809
810
0
  fz_try(ctx)
811
0
  {
812
0
    fz_close_device(ctx, dev);
813
0
    switch (wri->format)
814
0
    {
815
0
    default:
816
0
    case FZ_FORMAT_TEXT:
817
0
      fz_print_stext_page_as_text(ctx, wri->out, wri->page);
818
0
      break;
819
0
    case FZ_FORMAT_HTML:
820
0
      fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
821
0
      break;
822
0
    case FZ_FORMAT_XHTML:
823
0
      fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
824
0
      break;
825
0
    case FZ_FORMAT_STEXT_XML:
826
0
      fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
827
0
      break;
828
0
    case FZ_FORMAT_STEXT_JSON:
829
0
      if (wri->number > 1)
830
0
        fz_write_string(ctx, wri->out, ",");
831
0
      fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
832
0
      break;
833
0
    }
834
0
  }
835
0
  fz_always(ctx)
836
0
  {
837
0
    fz_drop_device(ctx, dev);
838
0
    fz_drop_stext_page(ctx, wri->page);
839
0
    wri->page = NULL;
840
0
  }
841
0
  fz_catch(ctx)
842
0
    fz_rethrow(ctx);
843
0
}
844
845
static void
846
text_close_writer(fz_context *ctx, fz_document_writer *wri_)
847
0
{
848
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
849
0
  switch (wri->format)
850
0
  {
851
0
  case FZ_FORMAT_HTML:
852
0
    fz_print_stext_trailer_as_html(ctx, wri->out);
853
0
    break;
854
0
  case FZ_FORMAT_XHTML:
855
0
    fz_print_stext_trailer_as_xhtml(ctx, wri->out);
856
0
    break;
857
0
  case FZ_FORMAT_STEXT_XML:
858
0
    fz_write_string(ctx, wri->out, "</document>\n");
859
0
    break;
860
0
  case FZ_FORMAT_STEXT_JSON:
861
0
    fz_write_string(ctx, wri->out, "]\n");
862
0
    break;
863
0
  }
864
0
  fz_close_output(ctx, wri->out);
865
0
}
866
867
static void
868
text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
869
0
{
870
0
  fz_text_writer *wri = (fz_text_writer*)wri_;
871
0
  fz_drop_stext_page(ctx, wri->page);
872
0
  fz_drop_output(ctx, wri->out);
873
0
}
874
875
fz_document_writer *
876
fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
877
0
{
878
0
  fz_text_writer *wri = NULL;
879
880
0
  fz_var(wri);
881
882
0
  fz_try(ctx)
883
0
  {
884
0
    wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
885
0
    fz_parse_stext_options(ctx, &wri->opts, options);
886
887
0
    wri->format = FZ_FORMAT_TEXT;
888
0
    if (!strcmp(format, "text"))
889
0
      wri->format = FZ_FORMAT_TEXT;
890
0
    else if (!strcmp(format, "html"))
891
0
      wri->format = FZ_FORMAT_HTML;
892
0
    else if (!strcmp(format, "xhtml"))
893
0
      wri->format = FZ_FORMAT_XHTML;
894
0
    else if (!strcmp(format, "stext"))
895
0
      wri->format = FZ_FORMAT_STEXT_XML;
896
0
    else if (!strcmp(format, "stext.xml"))
897
0
      wri->format = FZ_FORMAT_STEXT_XML;
898
0
    else if (!strcmp(format, "stext.json"))
899
0
    {
900
0
      wri->format = FZ_FORMAT_STEXT_JSON;
901
0
      wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
902
0
    }
903
904
0
    wri->out = out;
905
906
0
    switch (wri->format)
907
0
    {
908
0
    case FZ_FORMAT_HTML:
909
0
      fz_print_stext_header_as_html(ctx, wri->out);
910
0
      break;
911
0
    case FZ_FORMAT_XHTML:
912
0
      fz_print_stext_header_as_xhtml(ctx, wri->out);
913
0
      break;
914
0
    case FZ_FORMAT_STEXT_XML:
915
0
      fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
916
0
      fz_write_string(ctx, wri->out, "<document>\n");
917
0
      break;
918
0
    case FZ_FORMAT_STEXT_JSON:
919
0
      fz_write_string(ctx, wri->out, "[");
920
0
      break;
921
0
    }
922
0
  }
923
0
  fz_catch(ctx)
924
0
  {
925
0
    fz_drop_output(ctx, out);
926
0
    fz_free(ctx, wri);
927
0
    fz_rethrow(ctx);
928
0
  }
929
930
0
  return (fz_document_writer*)wri;
931
0
}
932
933
fz_document_writer *
934
fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
935
0
{
936
0
  fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
937
0
  return fz_new_text_writer_with_output(ctx, format, out, options);
938
0
}