Coverage Report

Created: 2025-01-11 06:55

/src/mupdf/source/fitz/stext-device.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2024 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
// You should have received a copy of the GNU Affero General Public License
15
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
16
//
17
// Alternative licensing terms are available from the licensor.
18
// For commercial licensing, see <https://www.artifex.com/> or contact
19
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
20
// CA 94129, USA, for further information.
21
22
#include "mupdf/fitz.h"
23
24
#include "glyphbox.h"
25
26
#include <float.h>
27
#include <string.h>
28
29
/* Simple layout structure */
30
31
/*
 * Create an empty layout block.
 *
 * A fresh pool is created and the block itself is allocated from it;
 * the block keeps a pointer to the pool so that fz_drop_layout() can
 * release everything in one go. The line list starts out empty, with
 * the tail pointer aimed at the list head.
 *
 * Rethrows any error raised by the allocation, after dropping the pool.
 */
fz_layout_block *fz_new_layout(fz_context *ctx)
{
  fz_layout_block *layout;
  fz_pool *arena = fz_new_pool(ctx);

  fz_try(ctx)
  {
    layout = fz_pool_alloc(ctx, arena, sizeof *layout);
    layout->head = NULL;
    layout->tailp = &layout->head;
    layout->pool = arena;
  }
  fz_catch(ctx)
  {
    /* Release the pool before propagating the error. */
    fz_drop_pool(ctx, arena);
    fz_rethrow(ctx);
  }

  return layout;
}
49
50
/*
 * Release a layout block by dropping the pool it records.
 * A NULL block is accepted and ignored.
 */
void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
{
  if (block == NULL)
    return;

  fz_drop_pool(ctx, block->pool);
}
55
56
/*
 * Append a new, empty line to the end of a layout block.
 *
 * x, y, font_size and p are stored on the line as given (p is stored
 * as a pointer, not copied). After this call, characters added with
 * fz_add_layout_char() are appended to this line.
 */
void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
{
  fz_layout_line *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);

  entry->p = p;
  entry->x = x;
  entry->y = y;
  entry->font_size = font_size;
  entry->text = NULL;
  entry->next = NULL;

  /* Hook the line onto the tail of the line list. */
  *block->tailp = entry;
  block->tailp = &entry->next;

  /* Subsequent characters start this line's character list. */
  block->text_tailp = &entry->text;
}
69
70
/*
 * Append a character to the line most recently added to the block.
 *
 * x, advance and p are stored on the character as given (p is stored
 * as a pointer, not copied). The character is written through
 * block->text_tailp, which fz_add_layout_line() sets up; this function
 * does not check that a line has been added first.
 */
void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
{
  fz_layout_char *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);

  entry->p = p;
  entry->x = x;
  entry->advance = advance;
  entry->next = NULL;

  /* Hook the character onto the tail of the current line's text. */
  *block->text_tailp = entry;
  block->text_tailp = &entry->next;
}
80
81
/* Extract text into blocks and lines. */
82
83
0
#define PARAGRAPH_DIST 1.5f
84
0
#define SPACE_DIST 0.15f
85
0
#define SPACE_MAX_DIST 0.8f
86
0
#define BASE_MAX_DIST 0.8f
87
0
#define FAKE_BOLD_MAX_DIST 0.1f
88
89
/* We keep a stack of the different metatexts that apply at any
90
 * given point (normally none!). Whenever we get some content
91
 * with a metatext in force, we really want to update the bounds
92
 * for that metatext. But running along the whole list each time
93
 * would be painful. So we just update the bounds for dev->metatext
94
 * and rely on metatext_bounds() propagating it upwards 'just in
95
 * time' for us to use metatexts other than the latest one. This
96
 * also means we need to propagate bounds upwards when we pop
97
 * a metatext.
98
 *
99
 * Why do we need bounds at all? Well, suppose we get:
100
 *    /Span <</ActualText (c) >> BDC /Im0 Do EMC
101
 * Then where on the page do we put 'c' ? By collecting the
102
 * bounds, we can place 'c' wherever the image was.
103
 */
104
typedef struct metatext_t
105
{
106
  fz_metatext type;
107
  char *text;
108
  fz_rect bounds;
109
  struct metatext_t *prev;
110
} metatext_t;
111
112
typedef struct
113
{
114
  fz_device super;
115
  fz_stext_page *page;
116
  fz_point pen, start;
117
  fz_point lag_pen;
118
  fz_matrix trm;
119
  int new_obj;
120
  int lastchar;
121
  int lastbidi;
122
  int flags;
123
  int color;
124
  int last_was_fake_bold;
125
  const fz_text *lasttext;
126
  fz_stext_options opts;
127
128
  metatext_t *metatext;
129
130
  /* Store the last values we saw. We need this for flushing the actualtext. */
131
  struct
132
  {
133
    int valid;
134
    int clipped;
135
    fz_matrix trm;
136
    int wmode;
137
    int bidi_level;
138
    fz_font *font;
139
    int flags;
140
  } last;
141
} fz_stext_device;
142
143
/* Human-readable summary of the structured text options, one option per line. */
const char *fz_stext_options_usage =
  "Text output options:\n"
  "\tinhibit-spaces: don't add spaces between gaps in the text\n"
  "\tpreserve-images: keep images in output\n"
  "\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
  "\tpreserve-whitespace: do not convert all whitespace into space characters\n"
  "\tpreserve-spans: do not merge spans on the same line\n"
  "\tdehyphenate: attempt to join up hyphenated words\n"
  "\tuse-cid-for-unknown-unicode: guess unicode from cid if normal mapping fails\n"
  "\tclip: do not include text that is completely clipped\n"
  "\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
  "\tstructured=no: don't collect structure data\n"
  "\taccurate-bboxes=no: calculate char bboxes from the outlines\n"
  "\tvectors=no: include vector bboxes in output\n"
  "\tsegment=no: don't attempt to segment the page\n"
  "\ttable-hunt: hunt for tables within a (segmented) page\n"
  "\tcollect-flags: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
  "\n";
161
162
/* Find the current actualtext, if any. Will abort if dev == NULL. */
163
static metatext_t *
164
find_actualtext(fz_stext_device *dev)
165
0
{
166
0
  metatext_t *mt = dev->metatext;
167
168
0
  while (mt && mt->type != FZ_METATEXT_ACTUALTEXT)
169
0
    mt = mt->prev;
170
171
0
  return mt;
172
0
}
173
174
/* Find the bounds of the given metatext. Will abort if mt or
175
 * dev are NULL. */
176
static fz_rect *
177
metatext_bounds(metatext_t *mt, fz_stext_device *dev)
178
0
{
179
0
  metatext_t *mt2 = dev->metatext;
180
181
0
  while (mt2 != mt)
182
0
  {
183
0
    mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds);
184
0
    mt2 = mt2->prev;
185
0
  }
186
187
0
  return &mt->bounds;
188
0
}
189
190
/* Find the bounds of the current actualtext, or NULL if there
191
 * isn't one. Will abort if dev is NULL. */
192
static fz_rect *
193
actualtext_bounds(fz_stext_device *dev)
194
0
{
195
0
  metatext_t *mt = find_actualtext(dev);
196
197
0
  if (mt == NULL)
198
0
    return NULL;
199
200
0
  return metatext_bounds(mt, dev);
201
0
}
202
203
fz_stext_page *
204
fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
205
0
{
206
0
  fz_pool *pool = fz_new_pool(ctx);
207
0
  fz_stext_page *page = NULL;
208
0
  fz_try(ctx)
209
0
  {
210
0
    page = fz_pool_alloc(ctx, pool, sizeof(*page));
211
0
    page->pool = pool;
212
0
    page->mediabox = mediabox;
213
0
    page->first_block = NULL;
214
0
    page->last_block = NULL;
215
0
  }
216
0
  fz_catch(ctx)
217
0
  {
218
0
    fz_drop_pool(ctx, pool);
219
0
    fz_rethrow(ctx);
220
0
  }
221
0
  return page;
222
0
}
223
224
static void
225
drop_run(fz_context *ctx, fz_stext_block *block)
226
0
{
227
0
  fz_stext_line *line;
228
0
  fz_stext_char *ch;
229
0
  while (block)
230
0
  {
231
0
    switch (block->type)
232
0
    {
233
0
    case FZ_STEXT_BLOCK_IMAGE:
234
0
      fz_drop_image(ctx, block->u.i.image);
235
0
      break;
236
0
    case FZ_STEXT_BLOCK_TEXT:
237
0
      for (line = block->u.t.first_line; line; line = line->next)
238
0
        for (ch = line->first_char; ch; ch = ch->next)
239
0
          fz_drop_font(ctx, ch->font);
240
0
      break;
241
0
    case FZ_STEXT_BLOCK_STRUCT:
242
0
      drop_run(ctx, block->u.s.down->first_block);
243
0
      break;
244
0
    default:
245
0
      break;
246
0
    }
247
0
    block = block->next;
248
0
  }
249
0
}
250
251
void
252
fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
253
0
{
254
0
  if (page)
255
0
  {
256
0
    drop_run(ctx, page->first_block);
257
0
    fz_drop_pool(ctx, page->pool);
258
0
  }
259
0
}
260
261
/*
262
 * This adds a new block at the end of the page. This should not be used
263
 * to add 'struct' blocks to the page as those have to be added internally,
264
 * with more complicated pointer setup.
265
 */
266
static fz_stext_block *
267
add_block_to_page(fz_context *ctx, fz_stext_page *page)
268
0
{
269
0
  fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
270
0
  block->bbox = fz_empty_rect; /* Fixes bug 703267. */
271
0
  block->prev = page->last_block;
272
0
  if (page->last_struct)
273
0
  {
274
0
    if (page->last_struct->last_block)
275
0
    {
276
0
      block->prev = page->last_struct->last_block;
277
0
      block->prev->next = block;
278
0
      page->last_struct->last_block = block;
279
0
    }
280
0
    else
281
0
      page->last_struct->last_block = page->last_struct->first_block = block;
282
0
  }
283
0
  else if (!page->last_block)
284
0
  {
285
0
    page->last_block = block;
286
0
    if (!page->first_block)
287
0
      page->first_block = block;
288
0
  }
289
0
  else
290
0
  {
291
0
    page->last_block->next = block;
292
0
    page->last_block = block;
293
0
  }
294
0
  return block;
295
0
}
296
297
static fz_stext_block *
298
add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
299
0
{
300
0
  fz_stext_block *block = add_block_to_page(ctx, page);
301
0
  block->type = FZ_STEXT_BLOCK_TEXT;
302
0
  return block;
303
0
}
304
305
static fz_stext_block *
306
add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
307
0
{
308
0
  fz_stext_block *block = add_block_to_page(ctx, page);
309
0
  block->type = FZ_STEXT_BLOCK_IMAGE;
310
0
  block->u.i.transform = ctm;
311
0
  block->u.i.image = fz_keep_image(ctx, image);
312
0
  block->bbox = fz_transform_rect(fz_unit_rect, ctm);
313
0
  return block;
314
0
}
315
316
static fz_stext_line *
317
add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
318
0
{
319
0
  fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
320
0
  line->prev = block->u.t.last_line;
321
0
  if (!block->u.t.first_line)
322
0
    block->u.t.first_line = block->u.t.last_line = line;
323
0
  else
324
0
  {
325
0
    block->u.t.last_line->next = line;
326
0
    block->u.t.last_line = line;
327
0
  }
328
329
0
  line->dir = *dir;
330
0
  line->wmode = wmode;
331
332
0
  return line;
333
0
}
334
335
0
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
336
0
#define NON_ACCURATE_GLYPH (-1)
337
338
static fz_stext_char *
339
add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags)
340
0
{
341
0
  fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
342
0
  fz_point a, d;
343
344
0
  if (!line->first_char)
345
0
    line->first_char = line->last_char = ch;
346
0
  else
347
0
  {
348
0
    line->last_char->next = ch;
349
0
    line->last_char = ch;
350
0
  }
351
352
0
  ch->c = c;
353
0
  ch->argb = color;
354
0
  ch->bidi = bidi;
355
0
  ch->origin = *p;
356
0
  ch->size = size;
357
0
  ch->font = fz_keep_font(ctx, font);
358
0
  ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0);
359
0
  if (font->flags.is_bold)
360
0
    ch->flags |= FZ_STEXT_BOLD;
361
362
0
  if (line->wmode == 0)
363
0
  {
364
0
    a.x = 0;
365
0
    d.x = 0;
366
0
    if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
367
0
    {
368
      /* Added space, in accurate mode. */
369
0
      a.y = d.y = 0;
370
0
    }
371
0
    else if (glyph == NON_ACCURATE_GLYPH)
372
0
    {
373
      /* Non accurate mode. */
374
0
      a.y = fz_font_ascender(ctx, font);
375
0
      d.y = fz_font_descender(ctx, font);
376
0
    }
377
0
    else
378
0
    {
379
      /* Any glyph in accurate mode */
380
0
      fz_rect bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
381
0
      a.y = bounds.y1;
382
0
      d.y = bounds.y0;
383
0
    }
384
0
  }
385
0
  else
386
0
  {
387
0
    a.x = 1;
388
0
    d.x = 0;
389
0
    a.y = 0;
390
0
    d.y = 0;
391
0
  }
392
0
  a = fz_transform_vector(a, trm);
393
0
  d = fz_transform_vector(d, trm);
394
395
0
  ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
396
0
  ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
397
0
  ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
398
0
  ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
399
400
0
  return ch;
401
0
}
402
403
static void
404
remove_last_char(fz_context *ctx, fz_stext_line *line)
405
0
{
406
0
  if (line && line->first_char)
407
0
  {
408
0
    fz_stext_char *prev = NULL;
409
0
    fz_stext_char *ch = line->first_char;
410
0
    while (ch->next)
411
0
    {
412
0
      prev = ch;
413
0
      ch = ch->next;
414
0
    }
415
0
    if (prev)
416
0
    {
417
      /* The characters are pool allocated, so we don't actually leak the removed node. */
418
      /* We do need to drop the char's font reference though. */
419
0
      fz_drop_font(ctx, prev->next->font);
420
0
      line->last_char = prev;
421
0
      line->last_char->next = NULL;
422
0
    }
423
0
  }
424
0
}
425
426
/*
 * Reverse, in place, the run of chars from curr up to (but excluding)
 * tail. The reversed run ends by pointing at tail. Returns the new
 * first char of the run (or tail itself if the run is empty).
 */
static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
{
  fz_stext_char *reversed = tail;

  while (curr != tail)
  {
    fz_stext_char *rest = curr->next;

    curr->next = reversed;
    reversed = curr;
    curr = rest;
  }

  return reversed;
}
439
440
/*
 * Reverse every run of two or more consecutive chars with a non-zero
 * bidi value within a line, and leave line->last_char pointing at the
 * final char of the (possibly reordered) list.
 */
static void reverse_bidi_line(fz_stext_line *line)
{
  /* 'link' is the pointer through which 'ch' is reached. */
  fz_stext_char **link = &line->first_char;
  fz_stext_char *ch = line->first_char;

  while (ch != NULL)
  {
    if (ch->bidi)
    {
      fz_stext_char *run_end = ch;

      while (run_end->next != NULL && run_end->next->bidi)
        run_end = run_end->next;

      /* After reversal 'ch' is the last char of the run, so
       * stepping to ch->next below moves past the whole run. */
      if (run_end != ch)
        *link = reverse_bidi_span(ch, run_end->next);
    }

    link = &ch->next;
    line->last_char = ch;
    ch = ch->next;
  }
}
458
459
/* Return 1 if c is one of the characters treated as a hyphen, 0 otherwise. */
static int is_hyphen(int c)
{
  switch (c)
  {
  case '-':    /* hyphen-minus */
  case 0xAD:   /* soft hyphen */
  case 0x2010: /* hyphen */
  case 0x2011: /* non-breaking hyphen */
    return 1;
  default:
    return 0;
  }
}
464
465
static float
466
vec_dot(const fz_point *a, const fz_point *b)
467
0
{
468
0
  return a->x * b->x + a->y * b->y;
469
0
}
470
471
/*
 * Decide whether a synthetic space may be inserted after lastchar.
 *
 * Never after an existing space. Otherwise allowed when lastchar is
 * below 0x700 (basic latin, greek, cyrillic, hebrew, arabic) or in
 * 0x2000..0x20CF (general punctuation, superscripts and subscripts,
 * and currency symbols).
 */
static int may_add_space(int lastchar)
{
  if (lastchar == ' ')
    return 0;

  if (lastchar < 0x700)
    return 1;

  return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
480
481
0
/* Two coordinates count as "close" when they differ by less than
 * size / FAKEBOLD_THRESHOLD_RECIP. */
#define FAKEBOLD_THRESHOLD_RECIP 10

/* Return non-zero if a and b differ by less than a tenth of size. */
static int
close(float a, float b, float size)
{
  float gap = a - b;

  if (gap < 0)
    gap = -gap;

  return FAKEBOLD_THRESHOLD_RECIP * gap < size;
}
492
493
static int
494
font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
495
0
{
496
0
  unsigned char fdigest[16];
497
0
  unsigned char gdigest[16];
498
499
0
  if (f == g)
500
0
    return 1;
501
502
0
  if (strcmp(f->name, g->name) != 0)
503
0
    return 0;
504
505
0
  fz_font_digest(ctx, f, fdigest);
506
0
  fz_font_digest(ctx, g, gdigest);
507
508
0
  return (memcmp(fdigest, gdigest, 16) == 0);
509
0
}
510
511
static int
512
check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
513
0
{
514
0
  fz_stext_line *line;
515
0
  fz_stext_char *ch;
516
517
0
  for (; block != NULL; block = block->next)
518
0
  {
519
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
520
0
    {
521
0
      if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
522
0
        return 1;
523
0
    }
524
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
525
0
    {
526
0
      for (line = block->u.t.first_line; line != NULL; line = line->next)
527
0
      {
528
0
        fz_stext_char *pr = NULL;
529
0
        for (ch = line->first_char; ch != NULL; ch = ch->next)
530
0
        {
531
          /* Not perfect, but it'll do! */
532
0
          if (ch->c == c && close(ch->origin.x, p.x, size) && close(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
533
0
          {
534
            /* If we were filled before, and we are stroking now... */
535
0
            if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
536
0
              (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
537
0
            {
538
              /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
539
0
              ch->flags |= flags;
540
0
              return 1;
541
0
            }
542
            /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
543
             * as boldening if either the char before, or the char after were also boldened. */
544
0
            ch->flags |= flags;
545
546
0
            if (c == ' ')
547
0
            {
548
0
              if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
549
0
                (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
550
0
              {
551
                /* OK, we can be bold. */
552
0
                ch->flags |= FZ_STEXT_BOLD;
553
0
                return 1;
554
0
              }
555
              /* Ignore this and keep going */
556
0
            }
557
0
            else
558
0
            {
559
0
              ch->flags |= FZ_STEXT_BOLD;
560
0
              return 1;
561
0
            }
562
0
          }
563
0
          pr = ch;
564
0
        }
565
0
      }
566
0
    }
567
0
  }
568
569
0
  return 0;
570
0
}
571
572
static void
573
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
574
0
{
575
0
  fz_stext_page *page = dev->page;
576
0
  fz_stext_block *cur_block;
577
0
  fz_stext_line *cur_line;
578
579
0
  int new_para = 0;
580
0
  int new_line = 1;
581
0
  int add_space = 0;
582
0
  fz_point dir, ndir, p, q;
583
0
  float size;
584
0
  fz_point delta;
585
0
  float spacing = 0;
586
0
  float base_offset = 0;
587
0
  float dist;
588
589
  /* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
590
0
  bidi = bidi & 1;
591
592
  /* dir = direction vector for motion. ndir = normalised(dir) */
593
0
  if (wmode == 0)
594
0
  {
595
0
    dir.x = 1;
596
0
    dir.y = 0;
597
0
  }
598
0
  else
599
0
  {
600
0
    dir.x = 0;
601
0
    dir.y = -1;
602
0
  }
603
0
  dir = fz_transform_vector(dir, trm);
604
0
  ndir = fz_normalize_vector(dir);
605
606
0
  size = fz_matrix_expansion(trm);
607
608
  /* We need to identify where glyphs 'start' (p) and 'stop' (q).
609
   * Each glyph holds its 'start' position, and the next glyph in the
610
   * span (or span->max if there is no next glyph) holds its 'end'
611
   * position.
612
   *
613
   * For both horizontal and vertical motion, trm->{e,f} gives the
614
   * origin (usually the bottom left) of the glyph.
615
   *
616
   * In horizontal mode:
617
   *   + p is bottom left.
618
   *   + q is the bottom right
619
   * In vertical mode:
620
   *   + p is top left (where it advanced from)
621
   *   + q is bottom left
622
   */
623
0
  if (wmode == 0)
624
0
  {
625
0
    p.x = trm.e;
626
0
    p.y = trm.f;
627
0
    q.x = trm.e + adv * dir.x;
628
0
    q.y = trm.f + adv * dir.y;
629
0
  }
630
0
  else
631
0
  {
632
0
    p.x = trm.e - adv * dir.x;
633
0
    p.y = trm.f - adv * dir.y;
634
0
    q.x = trm.e;
635
0
    q.y = trm.f;
636
0
  }
637
638
0
  if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
639
0
  {
640
0
    if (glyph == -1)
641
0
    {
642
0
      if (dev->last_was_fake_bold)
643
0
        goto move_pen_and_exit;
644
0
    }
645
0
    else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
646
0
    {
647
0
      dev->last_was_fake_bold = 1;
648
0
      goto move_pen_and_exit;
649
0
    }
650
0
    dev->last_was_fake_bold = 0;
651
0
  }
652
653
  /* Find current position to enter new text. */
654
0
  cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
655
0
  if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
656
0
    cur_block = NULL;
657
0
  cur_line = cur_block ? cur_block->u.t.last_line : NULL;
658
659
0
  if (cur_line && glyph < 0)
660
0
  {
661
    /* Don't advance pen or break lines for no-glyph characters in a cluster */
662
0
    add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags);
663
0
    dev->lastbidi = bidi;
664
0
    dev->lastchar = c;
665
0
    return;
666
0
  }
667
668
0
  if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
669
0
  {
670
    /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
671
     * then we can't append to the current block/line. */
672
0
    new_para = 1;
673
0
    new_line = 1;
674
0
  }
675
0
  else
676
0
  {
677
    /* Detect fake bold where text is printed twice in the same place. */
678
    /* Largely supplanted by the check_for_fake_bold mechanism above,
679
     * but we leave this in for backward compatibility as it's cheap,
680
     * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
681
0
    dist = hypotf(q.x - dev->pen.x, q.y - dev->pen.y) / size;
682
0
    if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar)
683
0
      return;
684
685
    /* Calculate how far we've moved since the last character. */
686
0
    delta.x = p.x - dev->pen.x;
687
0
    delta.y = p.y - dev->pen.y;
688
689
    /* The transform has not changed, so we know we're in the same
690
     * direction. Calculate 2 distances; how far off the previous
691
     * baseline we are, together with how far along the baseline
692
     * we are from the expected position. */
693
0
    spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
694
0
    base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;
695
696
    /* Only a small amount off the baseline - we'll take this */
697
0
    if (fabsf(base_offset) < BASE_MAX_DIST)
698
0
    {
699
      /* If mixed LTR and RTL content */
700
0
      if ((bidi & 1) != (dev->lastbidi & 1))
701
0
      {
702
        /* Ignore jumps within line when switching between LTR and RTL text. */
703
0
        new_line = 0;
704
0
      }
705
706
      /* RTL */
707
0
      else if (bidi & 1)
708
0
      {
709
0
        fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
710
0
        float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;
711
712
        /* If the pen is where we would have been if we
713
         * had advanced backwards from the previous
714
         * character by this character's advance, we
715
         * are probably seeing characters emitted in
716
         * logical order.
717
         */
718
0
        if (fabsf(logical_spacing) < SPACE_DIST)
719
0
        {
720
0
          new_line = 0;
721
0
        }
722
723
        /* However, if the pen has advanced to where we would expect it
724
         * in an LTR context, we're seeing them emitted in visual order
725
         * and should flag them for reordering!
726
         */
727
0
        else if (fabsf(spacing) < SPACE_DIST)
728
0
        {
729
0
          bidi = 3; /* mark line as visual */
730
0
          new_line = 0;
731
0
        }
732
733
        /* And any other small jump could be a missing space. */
734
0
        else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
735
0
        {
736
0
          if (wmode == 0 && may_add_space(dev->lastchar))
737
0
            add_space = 1;
738
0
          new_line = 0;
739
0
        }
740
0
        else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
741
0
        {
742
          /* Motion is in line, but negative. We've probably got overlapping
743
           * chars here. Live with it. */
744
0
          new_line = 0;
745
0
        }
746
0
        else if (spacing > 0 && spacing < SPACE_MAX_DIST)
747
0
        {
748
0
          bidi = 3; /* mark line as visual */
749
0
          if (wmode == 0 && may_add_space(dev->lastchar))
750
0
            add_space = 1;
751
0
          new_line = 0;
752
0
        }
753
754
0
        else
755
0
        {
756
          /* Motion is large and unexpected (probably a new table column). */
757
0
          new_line = 1;
758
0
        }
759
0
      }
760
761
      /* LTR or neutral character */
762
0
      else
763
0
      {
764
0
        if (fabsf(spacing) < SPACE_DIST)
765
0
        {
766
          /* Motion is in line and small enough to ignore. */
767
0
          new_line = 0;
768
0
        }
769
0
        else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
770
0
        {
771
          /* Motion is in line, but negative. We've probably got overlapping
772
           * chars here. Live with it. */
773
0
          new_line = 0;
774
0
        }
775
0
        else if (spacing > 0 && spacing < SPACE_MAX_DIST)
776
0
        {
777
          /* Motion is forward in line and large enough to warrant us adding a space. */
778
0
          if (wmode == 0 && may_add_space(dev->lastchar))
779
0
            add_space = 1;
780
0
          new_line = 0;
781
0
        }
782
0
        else
783
0
        {
784
          /* Motion is large and unexpected (probably a new table column). */
785
0
          new_line = 1;
786
0
        }
787
0
      }
788
0
    }
789
790
    /* Enough for a new line, but not enough for a new paragraph */
791
0
    else if (fabsf(base_offset) <= PARAGRAPH_DIST)
792
0
    {
793
      /* Check indent to spot text-indent style paragraphs */
794
0
      if (wmode == 0 && cur_line && dev->new_obj)
795
0
        if ((p.x - dev->start.x) > 0.5f)
796
0
          new_para = 1;
797
0
      new_line = 1;
798
0
    }
799
800
    /* Way off the baseline - open a new paragraph */
801
0
    else
802
0
    {
803
0
      new_para = 1;
804
0
      new_line = 1;
805
0
    }
806
0
  }
807
808
  /* Start a new block (but only at the beginning of a text object) */
809
0
  if (new_para || !cur_block)
810
0
  {
811
0
    cur_block = add_text_block_to_page(ctx, page);
812
0
    cur_line = cur_block->u.t.last_line;
813
0
  }
814
815
0
  if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
816
0
  {
817
0
    remove_last_char(ctx, cur_line);
818
0
    new_line = 0;
819
0
  }
820
821
  /* Start a new line */
822
0
  if (new_line || !cur_line || force_new_line)
823
0
  {
824
0
    cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
825
0
    dev->start = p;
826
0
  }
827
828
  /* Add synthetic space */
829
0
  if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
830
0
    add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, 1, flags);
831
832
0
  add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags);
833
834
0
move_pen_and_exit:
835
0
  dev->lastchar = c;
836
0
  dev->lastbidi = bidi;
837
0
  dev->lag_pen = p;
838
0
  dev->pen = q;
839
840
0
  dev->new_obj = 0;
841
0
  dev->trm = trm;
842
0
}
843
844
static void
845
fz_add_stext_char(fz_context *ctx,
846
  fz_stext_device *dev,
847
  fz_font *font,
848
  int c,
849
  int glyph,
850
  fz_matrix trm,
851
  float adv,
852
  int wmode,
853
  int bidi,
854
  int force_new_line,
855
  int flags)
856
0
{
857
  /* ignore when one unicode character maps to multiple glyphs */
858
0
  if (c == -1)
859
0
    return;
860
861
0
  if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
862
0
  {
863
0
    switch (c)
864
0
    {
865
0
    case 0xFB00: /* ff */
866
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
867
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
868
0
      return;
869
0
    case 0xFB01: /* fi */
870
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
871
0
      fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
872
0
      return;
873
0
    case 0xFB02: /* fl */
874
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
875
0
      fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
876
0
      return;
877
0
    case 0xFB03: /* ffi */
878
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
879
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
880
0
      fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
881
0
      return;
882
0
    case 0xFB04: /* ffl */
883
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
884
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
885
0
      fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
886
0
      return;
887
0
    case 0xFB05: /* long st */
888
0
    case 0xFB06: /* st */
889
0
      fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags);
890
0
      fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags);
891
0
      return;
892
0
    }
893
0
  }
894
895
0
  if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
896
0
  {
897
0
    switch (c)
898
0
    {
899
0
    case 0x0009: /* tab */
900
0
    case 0x0020: /* space */
901
0
    case 0x00A0: /* no-break space */
902
0
    case 0x1680: /* ogham space mark */
903
0
    case 0x180E: /* mongolian vowel separator */
904
0
    case 0x2000: /* en quad */
905
0
    case 0x2001: /* em quad */
906
0
    case 0x2002: /* en space */
907
0
    case 0x2003: /* em space */
908
0
    case 0x2004: /* three-per-em space */
909
0
    case 0x2005: /* four-per-em space */
910
0
    case 0x2006: /* six-per-em space */
911
0
    case 0x2007: /* figure space */
912
0
    case 0x2008: /* punctuation space */
913
0
    case 0x2009: /* thin space */
914
0
    case 0x200A: /* hair space */
915
0
    case 0x202F: /* narrow no-break space */
916
0
    case 0x205F: /* medium mathematical space */
917
0
    case 0x3000: /* ideographic space */
918
0
      c = ' ';
919
0
    }
920
0
  }
921
922
0
  fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
923
0
}
924
925
static fz_rect
926
current_clip(fz_context *ctx, fz_stext_device *dev)
927
0
{
928
0
  fz_rect r = fz_infinite_rect;
929
930
0
  if (dev->flags & FZ_STEXT_CLIP)
931
0
  {
932
0
    r = fz_device_current_scissor(ctx, &dev->super);
933
0
    r = fz_intersect_rect(r, dev->page->mediabox);
934
0
  }
935
0
  if (dev->flags & FZ_STEXT_CLIP_RECT)
936
0
    r = fz_intersect_rect(r, dev->opts.clip);
937
938
0
  return r;
939
0
}
940
941
static void
942
do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
943
0
{
944
0
  fz_font *font = span->font;
945
0
  fz_matrix tm = span->trm;
946
0
  float adv;
947
0
  int unicode;
948
0
  int i;
949
950
0
  for (i = start; i < end; i++)
951
0
  {
952
    /* Calculate new pen location and delta */
953
0
    tm.e = span->items[i].x;
954
0
    tm.f = span->items[i].y;
955
0
    dev->last.trm = fz_concat(tm, ctm);
956
0
    dev->last.bidi_level = span->bidi_level;
957
0
    dev->last.wmode = span->wmode;
958
0
    if (font != dev->last.font)
959
0
    {
960
0
      fz_drop_font(ctx, dev->last.font);
961
0
      dev->last.font = fz_keep_font(ctx, font);
962
0
    }
963
0
    dev->last.valid = 1;
964
0
    dev->last.flags = flags;
965
966
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
967
0
    {
968
0
      fz_rect r = current_clip(ctx, dev);
969
0
      if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
970
0
      {
971
0
        dev->last.clipped = 1;
972
0
        continue;
973
0
      }
974
0
    }
975
0
    dev->last.clipped = 0;
976
977
    /* Calculate bounding box and new pen position based on font metrics */
978
0
    if (span->items[i].gid >= 0)
979
0
      adv = span->items[i].adv;
980
0
    else
981
0
      adv = 0;
982
983
0
    unicode = span->items[i].ucs;
984
0
    if (unicode == FZ_REPLACEMENT_CHARACTER)
985
0
    {
986
0
      if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
987
0
      {
988
0
        unicode = span->items[i].cid;
989
0
        flags |= FZ_STEXT_UNICODE_IS_CID;
990
0
      }
991
0
      else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
992
0
      {
993
0
        unicode = span->items[i].gid;
994
0
        flags |= FZ_STEXT_UNICODE_IS_GID;
995
0
      }
996
0
    }
997
998
    /* Send the chars we have through. */
999
0
    fz_add_stext_char(ctx, dev, font,
1000
0
      unicode,
1001
0
      span->items[i].gid,
1002
0
      dev->last.trm,
1003
0
      adv,
1004
0
      dev->last.wmode,
1005
0
      dev->last.bidi_level,
1006
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1007
0
      flags);
1008
0
  }
1009
0
}
1010
1011
/* Return the rune at position 'idx' (counted in runes, from 0) of the
 * UTF-8 string 'utf8', or -1 if a zero rune is decoded at or before
 * that position. */
static int
rune_index(const char *utf8, size_t idx)
{
  size_t remaining = idx;
  int rune;

  for (;;)
  {
    utf8 += fz_chartorune(&rune, utf8);
    if (rune == 0)
      return -1;
    if (remaining == 0)
      break;
    remaining--;
  }

  return rune;
}
1027
1028
static void
1029
flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i)
1030
0
{
1031
0
  if (*actualtext == 0)
1032
0
    return;
1033
1034
0
  while (1)
1035
0
  {
1036
0
    int rune;
1037
0
    actualtext += fz_chartorune(&rune, actualtext);
1038
1039
0
    if (rune == 0)
1040
0
      break;
1041
1042
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1043
0
      if (dev->last.clipped)
1044
0
        continue;
1045
1046
0
    fz_add_stext_char(ctx, dev, dev->last.font,
1047
0
      rune,
1048
0
      -1,
1049
0
      dev->last.trm,
1050
0
      0,
1051
0
      dev->last.wmode,
1052
0
      dev->last.bidi_level,
1053
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1054
0
      dev->last.flags);
1055
0
    i++;
1056
0
  }
1057
0
}
1058
1059
static void
1060
do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
1061
0
{
1062
  /* We are within an actualtext block. This means we can't just add the chars
1063
   * as they are. We need to add the chars as they are meant to be. Sadly the
1064
   * actualtext mechanism doesn't help us at all with positioning. */
1065
0
  fz_font *font = span->font;
1066
0
  fz_matrix tm = span->trm;
1067
0
  float adv;
1068
0
  int start, i, end;
1069
0
  char *actualtext = mt->text;
1070
0
  size_t z = fz_utflen(actualtext);
1071
1072
  /* If actualtext is empty, nothing to do! */
1073
0
  if (z == 0)
1074
0
    return;
1075
1076
  /* Now, we HOPE that the creator of a PDF will minimise the actual text
1077
   * differences, so that we'll get:
1078
   *   "Politicians <Actualtext="lie">fib</ActualText>, always."
1079
   * rather than:
1080
   *   "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
1081
   * but experience with PDF files tells us that this won't always be the case.
1082
   *
1083
   * We try to minimise the actualtext section here, just in case.
1084
   */
1085
1086
  /* Spot a matching prefix and send it. */
1087
0
  for (start = 0; start < span->len; start++)
1088
0
  {
1089
0
    int rune;
1090
0
    int len = fz_chartorune(&rune, actualtext);
1091
0
    if (span->items[start].gid != rune || rune == 0)
1092
0
      break;
1093
0
    actualtext += len; z--;
1094
0
  }
1095
0
  if (start != 0)
1096
0
    do_extract(ctx, dev, span, ctm, 0, start, flags);
1097
1098
0
  if (start == span->len)
1099
0
  {
1100
    /* The prefix has consumed all this object. Just shorten the actualtext and we'll
1101
     * catch the rest next time. */
1102
0
    z = strlen(actualtext)+1;
1103
0
    memmove(mt->text, actualtext, z);
1104
0
    return;
1105
0
  }
1106
1107
  /* Spot a matching postfix. Can't send it til the end. */
1108
0
  for (end = span->len; end > start; end--)
1109
0
  {
1110
    /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
1111
0
    int rune = rune_index(actualtext, z-1);
1112
0
    if (span->items[end-1].gid != rune)
1113
0
      break;
1114
0
    z--;
1115
0
  }
1116
  /* So we can send end -> span->len at the end. */
1117
1118
  /* So we have at least SOME chars that don't match. */
1119
  /* Now, do the difficult bit in the middle.*/
1120
  /* items[start..end] have to be sent with actualtext[start..z] */
1121
0
  for (i = start; i < end; i++)
1122
0
  {
1123
0
    fz_text_item *item = &span->items[i];
1124
0
    int rune = -1;
1125
1126
0
    if ((size_t)i < z)
1127
0
      actualtext += fz_chartorune(&rune, actualtext);
1128
1129
    /* Calculate new pen location and delta */
1130
0
    tm.e = item->x;
1131
0
    tm.f = item->y;
1132
0
    dev->last.trm = fz_concat(tm, ctm);
1133
0
    dev->last.bidi_level = span->bidi_level;
1134
0
    dev->last.wmode = span->wmode;
1135
0
    if (font != dev->last.font)
1136
0
    {
1137
0
      fz_drop_font(ctx, dev->last.font);
1138
0
      dev->last.font = fz_keep_font(ctx, font);
1139
0
    }
1140
0
    dev->last.valid = 1;
1141
1142
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1143
0
    {
1144
0
      fz_rect r = current_clip(ctx, dev);
1145
0
      if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
1146
0
      {
1147
0
        dev->last.clipped = 1;
1148
0
        continue;
1149
0
      }
1150
0
    }
1151
0
    dev->last.clipped = 0;
1152
1153
    /* Calculate bounding box and new pen position based on font metrics */
1154
0
    if (item->gid >= 0)
1155
0
      adv = item->adv;
1156
0
    else
1157
0
      adv = 0;
1158
1159
0
    fz_add_stext_char(ctx, dev, font,
1160
0
      rune,
1161
0
      span->items[i].gid,
1162
0
      dev->last.trm,
1163
0
      adv,
1164
0
      dev->last.wmode,
1165
0
      dev->last.bidi_level,
1166
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1167
0
      flags);
1168
0
  }
1169
1170
  /* If we haven't spotted a postfix by this point, then don't force ourselves to output
1171
   * any more of the actualtext at this point. We might get a new text object that matches
1172
   * more of it. */
1173
0
  if (end == span->len)
1174
0
  {
1175
    /* Shorten actualtext and exit. */
1176
0
    z = strlen(actualtext)+1;
1177
0
    memmove(mt->text, actualtext, z);
1178
0
    return;
1179
0
  }
1180
1181
  /* We found a matching postfix. It seems likely that this is going to be the only
1182
   * text object we get, so send any remaining actualtext now. */
1183
0
  flush_actualtext(ctx, dev, actualtext, i);
1184
1185
  /* Send the postfix */
1186
0
  if (end != span->len)
1187
0
    do_extract(ctx, dev, span, ctm, end, span->len, flags);
1188
1189
0
  mt->text[0] = 0;
1190
0
}
1191
1192
static void
1193
fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
1194
0
{
1195
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1196
0
  metatext_t *mt = NULL;
1197
1198
0
  if (span->len == 0)
1199
0
    return;
1200
1201
  /* Are we in an actualtext? */
1202
0
  if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT))
1203
0
    mt = find_actualtext(dev);
1204
1205
0
  if (mt)
1206
0
    do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags);
1207
0
  else
1208
0
    do_extract(ctx, dev, span, ctm, 0, span->len, flags);
1209
0
}
1210
1211
/* Convert 'color' (in 'colorspace') plus 'alpha' into a packed 32 bit
 * value laid out as 0xAARRGGBB, each channel scaled to 0..255 with
 * rounding and clamped.
 *
 * The channels are combined as unsigned values: shifting an int holding
 * up to 255 left by 24 bits would shift into the sign bit, which is
 * undefined behaviour for signed types. */
static int hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
{
  float rgb[3];
  unsigned int a, r, g, b;

  fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);

  a = (unsigned int)fz_clampi(alpha * 255 + 0.5f, 0, 255);
  r = (unsigned int)fz_clampi(rgb[0] * 255 + 0.5f, 0, 255);
  g = (unsigned int)fz_clampi(rgb[1] * 255 + 0.5f, 0, 255);
  b = (unsigned int)fz_clampi(rgb[2] * 255 + 0.5f, 0, 255);

  return (int)((a << 24) | (r << 16) | (g << 8) | b);
}
1221
1222
static void
1223
fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
1224
  fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1225
0
{
1226
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1227
0
  fz_text_span *span;
1228
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1229
0
    return;
1230
0
  tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1231
0
  tdev->new_obj = 1;
1232
0
  for (span = text->head; span; span = span->next)
1233
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED);
1234
0
  fz_drop_text(ctx, tdev->lasttext);
1235
0
  tdev->lasttext = fz_keep_text(ctx, text);
1236
0
}
1237
1238
static void
1239
fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
1240
  fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1241
0
{
1242
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1243
0
  fz_text_span *span;
1244
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1245
0
    return;
1246
0
  tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1247
0
  tdev->new_obj = 1;
1248
0
  for (span = text->head; span; span = span->next)
1249
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED);
1250
0
  fz_drop_text(ctx, tdev->lasttext);
1251
0
  tdev->lasttext = fz_keep_text(ctx, text);
1252
0
}
1253
1254
static void
1255
fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
1256
0
{
1257
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1258
0
  fz_text_span *span;
1259
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1260
0
    return;
1261
0
  tdev->color = 0;
1262
0
  tdev->new_obj = 1;
1263
0
  for (span = text->head; span; span = span->next)
1264
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
1265
0
  fz_drop_text(ctx, tdev->lasttext);
1266
0
  tdev->lasttext = fz_keep_text(ctx, text);
1267
0
}
1268
1269
static void
1270
fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
1271
0
{
1272
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1273
0
  fz_text_span *span;
1274
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1275
0
    return;
1276
0
  tdev->color = 0;
1277
0
  tdev->new_obj = 1;
1278
0
  for (span = text->head; span; span = span->next)
1279
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
1280
0
  fz_drop_text(ctx, tdev->lasttext);
1281
0
  tdev->lasttext = fz_keep_text(ctx, text);
1282
0
}
1283
1284
static void
1285
fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
1286
0
{
1287
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1288
0
  fz_text_span *span;
1289
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1290
0
    return;
1291
0
  tdev->color = 0;
1292
0
  tdev->new_obj = 1;
1293
0
  for (span = text->head; span; span = span->next)
1294
0
    fz_stext_extract(ctx, tdev, span, ctm, 0);
1295
0
  fz_drop_text(ctx, tdev->lasttext);
1296
0
  tdev->lasttext = fz_keep_text(ctx, text);
1297
0
}
1298
1299
static void
1300
fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
1301
0
{
1302
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1303
0
  metatext_t *mt = fz_malloc_struct(ctx, metatext_t);
1304
1305
0
  mt->prev = tdev->metatext;
1306
0
  tdev->metatext = mt;
1307
0
  mt->type = meta;
1308
0
  mt->text = text ? fz_strdup(ctx, text) : NULL;
1309
0
  mt->bounds = fz_empty_rect;
1310
0
}
1311
1312
static void
1313
pop_metatext(fz_context *ctx, fz_stext_device *dev)
1314
0
{
1315
0
  metatext_t *prev;
1316
0
  fz_rect bounds;
1317
1318
0
  if (!dev->metatext)
1319
0
    return;
1320
1321
0
  prev = dev->metatext->prev;
1322
0
  bounds = dev->metatext->bounds;
1323
0
  fz_free(ctx, dev->metatext->text);
1324
0
  fz_free(ctx, dev->metatext);
1325
0
  dev->metatext = prev;
1326
0
  if (prev)
1327
0
    prev->bounds = fz_union_rect(prev->bounds, bounds);
1328
0
}
1329
1330
static void
1331
fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
1332
0
{
1333
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1334
0
  fz_font *myfont = NULL;
1335
1336
0
  if (!tdev->metatext)
1337
0
    return; /* Mismatched pop. Live with it. */
1338
1339
0
  if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT)
1340
0
  {
1341
    /* We only deal with ActualText here. Just pop anything else off,
1342
     * and we're done. */
1343
0
    pop_metatext(ctx, tdev);
1344
0
    return;
1345
0
  }
1346
1347
  /* If we have a 'last' text position, send the content after that. */
1348
0
  if (tdev->last.valid)
1349
0
  {
1350
0
    flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
1351
0
    pop_metatext(ctx, tdev);
1352
0
    return;
1353
0
  }
1354
1355
  /* If we have collected a rectangle for content that encloses the actual text,
1356
   * send the content there. */
1357
0
  if (!fz_is_empty_rect(tdev->metatext->bounds))
1358
0
  {
1359
0
    tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
1360
0
    tdev->last.trm.b = 0;
1361
0
    tdev->last.trm.c = 0;
1362
0
    tdev->last.trm.d = tdev->metatext->bounds.y1 - tdev->metatext->bounds.y0;
1363
0
    tdev->last.trm.e = tdev->metatext->bounds.x0;
1364
0
    tdev->last.trm.f = tdev->metatext->bounds.y0;
1365
0
  }
1366
0
  else
1367
0
    fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned.");
1368
1369
0
  fz_var(myfont);
1370
1371
0
  fz_try(ctx)
1372
0
  {
1373
0
    if (tdev->last.font == NULL)
1374
0
    {
1375
0
      myfont = fz_new_base14_font(ctx, "Helvetica");
1376
0
      tdev->last.font = myfont;
1377
0
    }
1378
0
    flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
1379
0
    pop_metatext(ctx, tdev);
1380
0
  }
1381
0
  fz_always(ctx)
1382
0
  {
1383
0
    if (myfont)
1384
0
    {
1385
0
      tdev->last.font = NULL;
1386
0
      fz_drop_font(ctx, myfont);
1387
0
    }
1388
0
  }
1389
0
  fz_catch(ctx)
1390
0
    fz_rethrow(ctx);
1391
0
}
1392
1393
1394
/* Images and shadings */
1395
1396
static void
1397
fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
1398
0
{
1399
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1400
0
  fz_rect *bounds = actualtext_bounds(tdev);
1401
1402
  /* If there is an actualtext in force, update its bounds. */
1403
0
  if (bounds)
1404
0
  {
1405
0
    static const fz_rect unit = { 0, 0, 1, 1 };
1406
0
    *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm));
1407
0
  }
1408
1409
  /* Unless we are being told to preserve images, nothing to do here. */
1410
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1411
0
    return;
1412
1413
  /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
1414
0
  if (alpha >= 0.5f)
1415
0
    add_image_block_to_page(ctx, tdev->page, ctm, img);
1416
1417
0
}
1418
1419
/* Device callback for image masks. Forwards to fz_stext_fill_image with
 * the same image, matrix, alpha and colour parameters; the 'cspace' and
 * 'color' arguments are not used. */
static void
fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
    fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
{
  fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
}
1425
1426
static fz_image *
1427
fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
1428
0
{
1429
0
  fz_matrix ctm = *in_out_ctm;
1430
0
  fz_pixmap *pix;
1431
0
  fz_image *img = NULL;
1432
0
  fz_rect bounds;
1433
0
  fz_irect bbox;
1434
1435
0
  bounds = fz_bound_shade(ctx, shade, ctm);
1436
0
  bounds = fz_intersect_rect(bounds, scissor);
1437
0
  bbox = fz_irect_from_rect(bounds);
1438
1439
0
  pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
1440
0
  fz_try(ctx)
1441
0
  {
1442
0
    if (shade->use_background)
1443
0
      fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
1444
0
    else
1445
0
      fz_clear_pixmap(ctx, pix);
1446
0
    fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
1447
0
    img = fz_new_image_from_pixmap(ctx, pix, NULL);
1448
0
  }
1449
0
  fz_always(ctx)
1450
0
    fz_drop_pixmap(ctx, pix);
1451
0
  fz_catch(ctx)
1452
0
    fz_rethrow(ctx);
1453
1454
0
  in_out_ctm->a = pix->w;
1455
0
  in_out_ctm->b = 0;
1456
0
  in_out_ctm->c = 0;
1457
0
  in_out_ctm->d = pix->h;
1458
0
  in_out_ctm->e = pix->x;
1459
0
  in_out_ctm->f = pix->y;
1460
0
  return img;
1461
0
}
1462
1463
static void
1464
fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
1465
0
{
1466
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1467
0
  fz_rect *bounds = actualtext_bounds(tdev);
1468
0
  fz_matrix local_ctm;
1469
0
  fz_rect scissor;
1470
0
  fz_image *image;
1471
1472
  /* If we aren't keeping images, but we are in a bound, update the bounds
1473
   * without generating the entire image. */
1474
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && bounds)
1475
0
  {
1476
0
    *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
1477
0
    return;
1478
0
  }
1479
1480
  /* Unless we are preserving image, nothing to do here. */
1481
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1482
0
    return;
1483
1484
0
  local_ctm = ctm;
1485
0
  scissor = fz_device_current_scissor(ctx, dev);
1486
0
  if (dev->flags & FZ_STEXT_CLIP_RECT)
1487
0
    scissor = fz_intersect_rect(scissor, tdev->opts.clip);
1488
0
  scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
1489
0
  image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
1490
0
  fz_try(ctx)
1491
0
    fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
1492
0
  fz_always(ctx)
1493
0
    fz_drop_image(ctx, image);
1494
0
  fz_catch(ctx)
1495
0
    fz_rethrow(ctx);
1496
0
}
1497
1498
static void
1499
fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
1500
0
{
1501
0
  fz_stext_line *line;
1502
0
  fz_stext_char *ch;
1503
1504
0
  for ( ; block != NULL; block = block->next)
1505
0
  {
1506
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1507
0
      if (block->u.s.down)
1508
0
        fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
1509
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1510
0
      continue;
1511
0
    for (line = block->u.t.first_line; line; line = line->next)
1512
0
    {
1513
0
      int reorder = 0;
1514
0
      for (ch = line->first_char; ch; ch = ch->next)
1515
0
      {
1516
0
        fz_rect ch_box = fz_rect_from_quad(ch->quad);
1517
0
        if (ch == line->first_char)
1518
0
          line->bbox = ch_box;
1519
0
        else
1520
0
          line->bbox = fz_union_rect(line->bbox, ch_box);
1521
0
        if (ch->bidi == 3)
1522
0
          reorder = 1;
1523
0
      }
1524
0
      block->bbox = fz_union_rect(block->bbox, line->bbox);
1525
0
      if (reorder)
1526
0
        reverse_bidi_line(line);
1527
0
    }
1528
0
  }
1529
0
}
1530
1531
static void
1532
fz_stext_close_device(fz_context *ctx, fz_device *dev)
1533
0
{
1534
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1535
0
  fz_stext_page *page = tdev->page;
1536
1537
0
  fixup_bboxes_and_bidi(ctx, page->first_block);
1538
1539
  /* TODO: smart sorting of blocks and lines in reading order */
1540
  /* TODO: unicode NFC normalization */
1541
1542
0
  if (tdev->opts.flags & FZ_STEXT_SEGMENT)
1543
0
    fz_segment_stext_page(ctx, page);
1544
1545
0
  if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT)
1546
0
    fz_table_hunt(ctx, page);
1547
1548
0
  if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK)
1549
0
    fz_paragraph_break(ctx, page);
1550
0
}
1551
1552
static void
1553
fz_stext_drop_device(fz_context *ctx, fz_device *dev)
1554
0
{
1555
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1556
0
  fz_drop_text(ctx, tdev->lasttext);
1557
0
  fz_drop_font(ctx, tdev->last.font);
1558
0
  while (tdev->metatext)
1559
0
    pop_metatext(ctx, tdev);
1560
0
}
1561
1562
static int
1563
val_is_rect(const char *val, fz_rect *rp)
1564
0
{
1565
0
  fz_rect r;
1566
0
  const char *s;
1567
1568
0
  s = strchr(val, ':');
1569
0
  if (s == NULL || s == val)
1570
0
    return 0;
1571
0
  r.x0 = fz_atof(val);
1572
0
  val = s+1;
1573
0
  s = strchr(val, ':');
1574
0
  if (s == NULL || s == val)
1575
0
    return 0;
1576
0
  r.y0 = fz_atof(val);
1577
0
  val = s+1;
1578
0
  s = strchr(val, ':');
1579
0
  if (s == NULL || s == val)
1580
0
    return 0;
1581
0
  r.x1 = fz_atof(val);
1582
0
  val = s+1;
1583
0
  r.y1 = fz_atof(val);
1584
1585
0
  *rp = r;
1586
1587
0
  return 1;
1588
0
}
1589
1590
fz_stext_options *
1591
fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
1592
0
{
1593
0
  const char *val;
1594
1595
0
  memset(opts, 0, sizeof *opts);
1596
1597
0
  if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
1598
0
    opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
1599
0
  if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
1600
0
    opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
1601
0
  if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes"))
1602
0
    opts->flags |= FZ_STEXT_PRESERVE_IMAGES;
1603
0
  if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes"))
1604
0
    opts->flags |= FZ_STEXT_INHIBIT_SPACES;
1605
0
  if (fz_has_option(ctx, string, "dehyphenate", &val) && fz_option_eq(val, "yes"))
1606
0
    opts->flags |= FZ_STEXT_DEHYPHENATE;
1607
0
  if (fz_has_option(ctx, string, "preserve-spans", &val) && fz_option_eq(val, "yes"))
1608
0
    opts->flags |= FZ_STEXT_PRESERVE_SPANS;
1609
0
  if (fz_has_option(ctx, string, "structured", &val) && fz_option_eq(val, "yes"))
1610
0
    opts->flags |= FZ_STEXT_COLLECT_STRUCTURE;
1611
0
  if (fz_has_option(ctx, string, "use-cid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
1612
0
    opts->flags |= FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE;
1613
0
  if (fz_has_option(ctx, string, "accurate-bboxes", &val) && fz_option_eq(val, "yes"))
1614
0
    opts->flags |= FZ_STEXT_ACCURATE_BBOXES;
1615
0
  if (fz_has_option(ctx, string, "vectors", &val) && fz_option_eq(val, "yes"))
1616
0
    opts->flags |= FZ_STEXT_COLLECT_VECTORS;
1617
0
  if (fz_has_option(ctx, string, "ignore-actualtext", & val) && fz_option_eq(val, "yes"))
1618
0
    opts->flags |= FZ_STEXT_IGNORE_ACTUALTEXT;
1619
0
  if (fz_has_option(ctx, string, "segment", &val) && fz_option_eq(val, "yes"))
1620
0
    opts->flags |= FZ_STEXT_SEGMENT;
1621
0
  if (fz_has_option(ctx, string, "paragraph-break", &val) && fz_option_eq(val, "yes"))
1622
0
    opts->flags |= FZ_STEXT_PARAGRAPH_BREAK;
1623
0
  if (fz_has_option(ctx, string, "table-hunt", &val) && fz_option_eq(val, "yes"))
1624
0
    opts->flags |= FZ_STEXT_TABLE_HUNT;
1625
0
  if (fz_has_option(ctx, string, "collect-styles", &val) && fz_option_eq(val, "yes"))
1626
0
    opts->flags |= FZ_STEXT_COLLECT_STYLES;
1627
1628
0
  opts->flags |= FZ_STEXT_CLIP;
1629
0
  if (fz_has_option(ctx, string, "mediabox-clip", &val))
1630
0
  {
1631
0
    fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
1632
0
    if (fz_option_eq(val, "no"))
1633
0
      opts->flags ^= FZ_STEXT_CLIP;
1634
0
  }
1635
0
  if (fz_has_option(ctx, string, "clip", &val) && fz_option_eq(val, "no"))
1636
0
    opts->flags ^= FZ_STEXT_CLIP;
1637
0
  if (fz_has_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
1638
0
    opts->flags |= FZ_STEXT_CLIP_RECT;
1639
1640
0
  opts->scale = 1;
1641
0
  if (fz_has_option(ctx, string, "resolution", &val))
1642
0
    opts->scale = fz_atof(val) / 96.0f; /* HTML base resolution is 96ppi */
1643
1644
0
  return opts;
1645
0
}
1646
1647
/* State accumulated by the is_rect_* path walker callbacks while
 * checking whether a path is made of at most four straight corners. */
typedef struct
{
  /* Set to 1 once the path has been rejected. */
  int fail;
  /* Number of entries of 'corners' filled in so far. */
  int count;
  /* The corner points seen, in path order. */
  fz_point corners[4];
} is_rect_data;
1653
1654
static void
1655
stash_point(is_rect_data *rd, float x, float y)
1656
0
{
1657
0
  if (rd->count > 3)
1658
0
  {
1659
0
    rd->fail = 1;
1660
0
    return;
1661
0
  }
1662
1663
0
  rd->corners[rd->count].x = x;
1664
0
  rd->corners[rd->count].y = y;
1665
0
  rd->count++;
1666
0
}
1667
1668
static void
1669
is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
1670
0
{
1671
0
  is_rect_data *rd = arg;
1672
0
  if (rd->fail)
1673
0
    return;
1674
1675
0
  if (rd->count != 0)
1676
0
  {
1677
0
    rd->fail = 1;
1678
0
    return;
1679
0
  }
1680
0
  stash_point(rd, x, y);
1681
0
}
1682
1683
static void
1684
is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
1685
0
{
1686
0
  is_rect_data *rd = arg;
1687
0
  if (rd->fail)
1688
0
    return;
1689
1690
0
  if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
1691
0
    return;
1692
1693
0
  stash_point(rd, x, y);
1694
0
}
1695
1696
static void
1697
is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
1698
0
{
1699
0
  is_rect_data *rd = arg;
1700
0
  rd->fail = 1;
1701
0
}
1702
1703
static void
1704
is_rect_closepath(fz_context *ctx, void *arg)
1705
0
{
1706
0
  is_rect_data *rd = arg;
1707
0
  if (rd->fail)
1708
0
    return;
1709
0
  if (rd->count == 3)
1710
0
    stash_point(rd, rd->corners[0].x, rd->corners[0].y);
1711
0
  if (rd->count != 4)
1712
0
    rd->fail = 1;
1713
0
}
1714
1715
/* Return non-zero if 'a' and 'b' differ by less than EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
  float diff = a - b;
  float magnitude = (diff < 0) ? -diff : diff;

  return magnitude < EPSILON;
}
1723
1724
static int
1725
is_path_rect(fz_context *ctx, fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
1726
0
{
1727
0
  float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
1728
0
  is_rect_data rd = { 0 };
1729
0
  static const fz_path_walker walker =
1730
0
  {
1731
0
    is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
1732
0
  };
1733
0
  int i;
1734
1735
0
  fz_walk_path(ctx, path, &walker, &rd);
1736
1737
0
  if (rd.fail)
1738
0
    return 0;
1739
1740
0
  if (rd.count == 2)
1741
0
  {
1742
0
    stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
1743
0
    stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
1744
0
  }
1745
1746
0
  for (i = 0 ; i < 4; i++)
1747
0
  {
1748
0
    fz_point p = fz_transform_point(rd.corners[i], ctm);
1749
1750
0
    rd.corners[i].x = p.x;
1751
0
    rd.corners[i].y = p.y;
1752
0
  }
1753
1754
  /* So we have a 4 cornered path. Hopefully something like:
1755
   * 0---------1
1756
   * |         |
1757
   * 3---------2
1758
   * but it might be:
1759
   * 0---------3
1760
   * |         |
1761
   * 1---------2
1762
  */
1763
0
  while (1)
1764
0
  {
1765
0
    d01x = rd.corners[1].x - rd.corners[0].x;
1766
0
    d01y = rd.corners[1].y - rd.corners[0].y;
1767
0
    d01 = d01x * d01x + d01y * d01y;
1768
0
    d03x = rd.corners[3].x - rd.corners[0].x;
1769
0
    d03y = rd.corners[3].y - rd.corners[0].y;
1770
0
    d03 = d03x * d03x + d03y * d03y;
1771
0
    if(d01 < d03)
1772
0
    {
1773
      /* We are the latter case. Transpose it. */
1774
0
      fz_point p = rd.corners[1];
1775
0
      rd.corners[1] = rd.corners[3];
1776
0
      rd.corners[3] = p;
1777
0
    }
1778
0
    else
1779
0
      break;
1780
0
  }
1781
0
  d32x = rd.corners[2].x - rd.corners[3].x;
1782
0
  d32y = rd.corners[2].y - rd.corners[3].y;
1783
1784
  /* So d32x and d01x need to be the same for this to be a strikeout. */
1785
0
  if (!feq(d32x, d01x) || !feq(d32y, d01y))
1786
0
    return 0;
1787
1788
  /* We are plausibly a rectangle. */
1789
0
  *thickness = sqrtf(d03x * d03x + d03y * d03y);
1790
1791
0
  from->x = (rd.corners[0].x + rd.corners[3].x)/2;
1792
0
  from->y = (rd.corners[0].y + rd.corners[3].y)/2;
1793
0
  to->x = (rd.corners[1].x + rd.corners[2].x)/2;
1794
0
  to->y = (rd.corners[1].y + rd.corners[2].y)/2;
1795
1796
0
  return 1;
1797
0
}
1798
1799
static void
1800
advance_x(fz_point *a, fz_point b, float d)
1801
0
{
1802
0
  a->y += (b.y - a->y) * d / (b.x - a->x);
1803
0
  a->x += d;
1804
0
}
1805
1806
static void
1807
advance_y(fz_point *a, fz_point b, float d)
1808
0
{
1809
0
  a->x += (b.x - a->x) * d / (b.y - a->y);
1810
0
  a->y += d;
1811
0
}
1812
1813
static int
1814
line_crosses_rect(fz_point a, fz_point b, fz_rect r)
1815
0
{
1816
  /* Cope with trivial exclusions */
1817
0
  if (a.x < r.x0 && b.x < r.x0)
1818
0
    return 0;
1819
0
  if (a.x > r.x1 && b.x > r.x1)
1820
0
    return 0;
1821
0
  if (a.y < r.y0 && b.y < r.y0)
1822
0
    return 0;
1823
0
  if (a.y > r.y1 && b.y > r.y1)
1824
0
    return 0;
1825
1826
0
  if (a.x < r.x0)
1827
0
    advance_x(&a, b, r.x0 - a.x);
1828
0
  if (a.x > r.x1)
1829
0
    advance_x(&a, b, r.x1 - a.x);
1830
0
  if (a.y < r.y0)
1831
0
    advance_y(&a, b, r.y0 - a.y);
1832
0
  if (a.y > r.y1)
1833
0
    advance_y(&a, b, r.y1 - a.y);
1834
1835
0
  return fz_is_point_inside_rect(a, r);
1836
0
}
1837
1838
static float
1839
calculate_ascent(fz_point p, fz_point origin, fz_point dir)
1840
0
{
1841
0
  return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
1842
0
}
1843
1844
/* Create us a rect from the given quad, but extend it downwards
1845
 * to allow for underlines that pass under the glyphs. */
1846
static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
1847
0
{
1848
  /* Consider the two rects from A and g respectively.
1849
   *
1850
   * ul +------+ ur   or
1851
   *    |  /\  |         ul +------+ ur
1852
   *    | /__\ |            | /''\ |
1853
   *    |/    \|            |(    ||
1854
   * ll +------+ lr         | ''''||
1855
   *                        |  ''' | <-expected underline level
1856
   *                     ll +------+ lr
1857
   *
1858
   * So an underline won't cross A's rect, but will cross g's.
1859
   * We want to make a rect that includes a suitable amount of
1860
   * space underneath. The information we have available to us
1861
   * is summed up here:
1862
   *
1863
   *  ul +---------+ ur
1864
   *     |         |
1865
   *     | origin  |
1866
   *     |+----------> dir
1867
   *     |         |
1868
   *  ll +---------+ lr
1869
   *
1870
   * Consider the distance from ul to the line that passes through
1871
   * the origin with direction dir. Similarly, consider the distance
1872
   * from ur to the same line. This can be thought of as the 'ascent'
1873
   * of this character.
1874
   *
1875
   * We'd like the distance from ul to ll to be greater than this, so
1876
   * as to ensure we cover the possible location where an underline
1877
   * might reasonably go.
1878
   *
1879
   * If we have a line (l) through point A with direction vector u,
1880
   * the distance between point P and line(l) is:
1881
   *
1882
   * d(P,l) = || AP x u || / || u ||
1883
   *
1884
   * where x is the cross product.
1885
   *
1886
   * For us, because || dir || = 1:
1887
   *
1888
   * d(ul, origin) = || (origin-ul) x dir ||
1889
   *
1890
   * The cross product is only defined in 3 (or 7!) dimensions, so
1891
   * extend both vectors into 3d by defining a 0 z component.
1892
   *
1893
   * (origin-ul) x dir = [ (origin.y - ul.y) . 0     - 0                 . dir.y ]
1894
   *                     [ 0                 . dir.x - (origin.x - ul.y) . 0     ]
1895
   *                     [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
1896
   *
1897
   * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
1898
   */
1899
0
  float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
1900
0
  fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
1901
0
  fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
1902
0
  float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
1903
0
  int neg = 0;
1904
1905
  /* We'd like height to be at least ascent + 1/4 size */
1906
0
  if (height < 0)
1907
0
    neg = 1, height = -height;
1908
0
  if (height < ascent + size * 0.25f)
1909
0
    height = ascent + size * 0.25f;
1910
1911
0
  height -= ascent;
1912
0
  if (neg)
1913
0
    height = -height;
1914
0
  quad.ll.x += - height * dir.y;
1915
0
  quad.ll.y +=   height * dir.x;
1916
0
  quad.lr.x += - height * dir.y;
1917
0
  quad.lr.y +=   height * dir.x;
1918
1919
0
  return fz_rect_from_quad(quad);
1920
0
}
1921
1922
static void
1923
check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
1924
0
{
1925
0
  fz_stext_block *block = page->last_block;
1926
0
  int is_rect;
1927
0
  float thickness;
1928
0
  fz_point from, to, dir;
1929
0
  union {
1930
0
    fz_path *p;
1931
0
    const fz_path *cp;
1932
0
  } u;
1933
1934
0
  u.cp = path;
1935
1936
  /* Is this path a thin rectangle (possibly rotated)? If so, then we need to
1937
   * consider it as being a strikeout or underline. */
1938
0
  is_rect = is_path_rect(ctx, u.p, &from, &to, &thickness, ctm);
1939
0
  if (!is_rect)
1940
0
    return;
1941
1942
0
  dir.x = to.x - from.x;
1943
0
  dir.y = to.y - from.y;
1944
0
  dir = fz_normalize_vector(dir);
1945
1946
  /* Does this line nicely cover a recent span? */
1947
0
  while (block)
1948
0
  {
1949
0
    fz_stext_line *line;
1950
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1951
0
    {
1952
0
      block = block->prev;
1953
0
      continue;
1954
0
    }
1955
0
    line = block->u.t.last_line;
1956
0
    while(line)
1957
0
    {
1958
0
      if ((feq(line->dir.x, dir.x) && feq(line->dir.y, dir.y)) ||
1959
0
        (feq(line->dir.x, -dir.x) && feq(line->dir.y, -dir.y)))
1960
0
      {
1961
        /* Matching directions... */
1962
1963
        /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
1964
         * chars. */
1965
0
        fz_stext_char *ch;
1966
0
        for (ch = line->first_char; ch; ch = ch->next)
1967
0
        {
1968
0
          fz_rect ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
1969
1970
0
          if (line_crosses_rect(from, to, ch_box))
1971
0
          {
1972
0
            float dx, dy, dot;
1973
            /* Is this a strikeout or an underline? */
1974
1975
            /* The baseline moves from ch->origin in the direction line->dir */
1976
0
            fz_point up;
1977
0
            up.x = line->dir.y;
1978
0
            up.y = -line->dir.x;
1979
1980
            /* How far is our line displaced from the line through the origin? */
1981
0
            dx = from.x - ch->origin.x;
1982
0
            dy = from.y - ch->origin.y;
1983
            /* Dot product with up. up is normalised */
1984
0
            dot = dx * up.x + dy * up.y;
1985
1986
0
            if (dot > 0)
1987
0
              ch->flags |= FZ_STEXT_STRIKEOUT;
1988
0
            else
1989
0
              ch->flags |= FZ_STEXT_UNDERLINE;
1990
0
          }
1991
0
        }
1992
0
      }
1993
0
      line = line->prev;
1994
0
    }
1995
1996
0
    block = block->prev;
1997
0
  }
1998
0
}
1999
2000
static void
2001
add_vector(fz_context *ctx, fz_stext_page *page, fz_rect bbox, int stroked, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2002
0
{
2003
0
  fz_stext_block *b = add_block_to_page(ctx, page);
2004
2005
0
  b->type = FZ_STEXT_BLOCK_VECTOR;
2006
0
  b->bbox = bbox;
2007
0
  b->u.v.stroked = stroked;
2008
0
  b->u.v.argb = hexrgba_from_color(ctx, cs, color, alpha);
2009
0
}
2010
2011
static void
2012
fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2013
0
{
2014
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2015
0
  fz_stext_page *page = tdev->page;
2016
0
  fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
2017
0
  fz_rect *bounds = actualtext_bounds(tdev);
2018
2019
  /* If we're in an actualttext, then update the bounds to include this content. */
2020
0
  if (bounds != NULL)
2021
0
    *bounds = fz_union_rect(*bounds, path_bounds);
2022
2023
0
  if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2024
0
    check_for_strikeout(ctx, tdev, page, path, ctm);
2025
2026
0
  if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2027
0
    add_vector(ctx, page, path_bounds, 0, cs, color, alpha, cp);
2028
0
}
2029
2030
static void
2031
fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2032
0
{
2033
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2034
0
  fz_stext_page *page = tdev->page;
2035
0
  fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm);
2036
0
  fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
2037
2038
  /* If we're in an actualttext, then update the bounds to include this content. */
2039
0
  if (bounds != NULL)
2040
0
    *bounds = fz_union_rect(*bounds, path_bounds);
2041
2042
0
  if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2043
0
    check_for_strikeout(ctx, tdev, page, path, ctm);
2044
2045
0
  if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2046
0
    add_vector(ctx, page, path_bounds, 1, cs, color, alpha, cp);
2047
0
}
2048
2049
static void
2050
new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
2051
0
{
2052
0
  fz_stext_struct *str;
2053
0
  size_t z;
2054
2055
0
  if (raw == NULL)
2056
0
    raw = "";
2057
0
  z = strlen(raw);
2058
2059
0
  str = fz_pool_alloc(ctx, page->pool, sizeof(*str) + z);
2060
0
  str->first_block = NULL;
2061
0
  str->last_block = NULL;
2062
0
  str->standard = standard;
2063
0
  str->parent = page->last_struct;
2064
0
  str->up = block;
2065
0
  memcpy(str->raw, raw, z+1);
2066
2067
0
  block->u.s.down = str;
2068
0
}
2069
2070
static void
2071
fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
2072
0
{
2073
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2074
0
  fz_stext_page *page = tdev->page;
2075
0
  fz_stext_block *block, *le, *gt, *newblock;
2076
2077
  /* Find a pointer to the last block. */
2078
0
  if (page->last_block)
2079
0
  {
2080
0
    block = page->last_block;
2081
0
  }
2082
0
  else if (page->last_struct)
2083
0
  {
2084
0
    block = page->last_struct->last_block;
2085
0
  }
2086
0
  else
2087
0
  {
2088
0
    block = page->first_block;
2089
0
  }
2090
2091
  /* So block is somewhere in the content chain. Let's try and find:
2092
   *   le = the struct node <= idx before block in the content chain.
2093
   *   ge = the struct node >= idx after block in the content chain.
2094
   * Search backwards to start with.
2095
   */
2096
0
  gt = NULL;
2097
0
  le = block;
2098
0
  while (le)
2099
0
  {
2100
0
    if (le->type == FZ_STEXT_BLOCK_STRUCT)
2101
0
    {
2102
0
      if (le->u.s.index > idx)
2103
0
        gt = le;
2104
0
      if (le->u.s.index <= idx)
2105
0
        break;
2106
0
    }
2107
0
    le = le->prev;
2108
0
  }
2109
  /* The following loop copes with finding gt (the smallest block with an index higher
2110
   * than we want) if we haven't found it already. The while loop in here was designed
2111
   * to cope with 'block' being in the middle of a list. In fact, the way the code is
2112
   * currently, block will always be at the end of a list, so the while won't do anything.
2113
   * But I'm loathe to remove it in case we ever change this code to start from wherever
2114
   * we did the last insertion. */
2115
0
  if (gt == NULL)
2116
0
  {
2117
0
    gt = block;
2118
0
    while (gt)
2119
0
    {
2120
0
      if (gt->type == FZ_STEXT_BLOCK_STRUCT)
2121
0
      {
2122
0
        if (gt->u.s.index <= idx)
2123
0
          le = gt;
2124
0
        if (gt->u.s.index >= idx)
2125
0
          break;
2126
0
      }
2127
0
      block = gt;
2128
0
      gt = gt->next;
2129
0
    }
2130
0
  }
2131
2132
0
  if (le && le->u.s.index == idx)
2133
0
  {
2134
    /* We want to move down into the le block. Does it have a struct
2135
     * attached yet? */
2136
0
    if (le->u.s.down == NULL)
2137
0
    {
2138
      /* No. We need to create a new struct node. */
2139
0
      new_stext_struct(ctx, page, le, standard, raw);
2140
0
    }
2141
0
    else if (le->u.s.down->standard != standard ||
2142
0
        (raw == NULL && le->u.s.down->raw[0] != 0) ||
2143
0
        (raw != NULL && strcmp(raw, le->u.s.down->raw) != 0))
2144
0
    {
2145
      /* Yes, but it doesn't match the one we expect! */
2146
0
      fz_warn(ctx, "Mismatched structure type!");
2147
0
    }
2148
0
    page->last_struct = le->u.s.down;
2149
0
    page->last_block = le->u.s.down->last_block;
2150
2151
0
    return;
2152
0
  }
2153
2154
  /* We are going to need to create a new block. Create a complete unlinked one here. */
2155
0
  newblock = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
2156
0
  newblock->bbox = fz_empty_rect;
2157
0
  newblock->prev = NULL;
2158
0
  newblock->next = NULL;
2159
0
  newblock->type = FZ_STEXT_BLOCK_STRUCT;
2160
0
  newblock->u.s.index = idx;
2161
0
  newblock->u.s.down = NULL;
2162
  /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
2163
0
  new_stext_struct(ctx, page, newblock, standard, raw);
2164
2165
  /* So now we just need to link it in somewhere. */
2166
0
  if (gt)
2167
0
  {
2168
    /* Link it in before gt. */
2169
0
    newblock->prev = gt->prev;
2170
0
    if (gt->prev)
2171
0
      gt->prev->next = newblock;
2172
0
    gt->prev = newblock;
2173
0
    newblock->next = gt;
2174
0
  }
2175
0
  else if (block)
2176
0
  {
2177
    /* Link it in at the end of the list (i.e. after 'block') */
2178
0
    newblock->prev = block;
2179
0
    block->next = newblock;
2180
0
  }
2181
0
  else if (page->last_struct)
2182
0
  {
2183
    /* We have no blocks at all at this level. */
2184
0
    page->last_struct->first_block = newblock;
2185
0
    page->last_struct->last_block = newblock;
2186
0
  }
2187
0
  else
2188
0
  {
2189
    /* We have no blocks at ANY level. */
2190
0
    page->first_block = newblock;
2191
0
  }
2192
  /* Whereever we linked it in, that's where we want to continue adding content. */
2193
0
  page->last_struct = newblock->u.s.down;
2194
0
  page->last_block = NULL;
2195
0
}
2196
2197
static void
2198
fz_stext_end_structure(fz_context *ctx, fz_device *dev)
2199
0
{
2200
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2201
0
  fz_stext_page *page = tdev->page;
2202
0
  fz_stext_struct *str = page->last_struct;
2203
2204
0
  if (str == NULL)
2205
0
  {
2206
0
    fz_warn(ctx, "Structure out of sync");
2207
0
    return;
2208
0
  }
2209
2210
0
  page->last_struct = str->parent;
2211
0
  if (page->last_struct == NULL)
2212
0
  {
2213
0
    page->last_block = page->first_block;
2214
    /* Yuck */
2215
0
    while (page->last_block->next)
2216
0
      page->last_block = page->last_block->next;
2217
0
  }
2218
0
  else
2219
0
  {
2220
0
    page->last_block = page->last_struct->last_block;
2221
0
  }
2222
0
}
2223
2224
fz_device *
2225
fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
2226
0
{
2227
0
  fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
2228
2229
0
  dev->super.close_device = fz_stext_close_device;
2230
0
  dev->super.drop_device = fz_stext_drop_device;
2231
2232
0
  dev->super.fill_text = fz_stext_fill_text;
2233
0
  dev->super.stroke_text = fz_stext_stroke_text;
2234
0
  dev->super.clip_text = fz_stext_clip_text;
2235
0
  dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
2236
0
  dev->super.ignore_text = fz_stext_ignore_text;
2237
0
  dev->super.begin_metatext = fz_stext_begin_metatext;
2238
0
  dev->super.end_metatext = fz_stext_end_metatext;
2239
2240
0
  dev->super.fill_shade = fz_stext_fill_shade;
2241
0
  dev->super.fill_image = fz_stext_fill_image;
2242
0
  dev->super.fill_image_mask = fz_stext_fill_image_mask;
2243
2244
0
  if (opts)
2245
0
  {
2246
0
    dev->flags = opts->flags;
2247
0
    if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE)
2248
0
    {
2249
0
      dev->super.begin_structure = fz_stext_begin_structure;
2250
0
      dev->super.end_structure = fz_stext_end_structure;
2251
0
    }
2252
0
    if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES))
2253
0
    {
2254
0
      dev->super.fill_path = fz_stext_fill_path;
2255
0
      dev->super.stroke_path = fz_stext_stroke_path;
2256
0
    }
2257
0
  }
2258
0
  dev->page = page;
2259
0
  dev->pen.x = 0;
2260
0
  dev->pen.y = 0;
2261
0
  dev->trm = fz_identity;
2262
0
  dev->lastchar = ' ';
2263
0
  dev->lasttext = NULL;
2264
0
  dev->lastbidi = 0;
2265
0
  dev->last_was_fake_bold = 1;
2266
0
  if (opts)
2267
0
    dev->opts = *opts;
2268
2269
0
  if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
2270
0
    dev->super.hints |= FZ_DONT_DECODE_IMAGES;
2271
2272
0
  return (fz_device*)dev;
2273
0
}