Coverage Report

Created: 2026-06-30 07:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/fitz/stext-device.c
Line
Count
Source
1
// Copyright (C) 2004-2026 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
// You should have received a copy of the GNU Affero General Public License
15
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
16
//
17
// Alternative licensing terms are available from the licensor.
18
// For commercial licensing, see <https://www.artifex.com/> or contact
19
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
20
// CA 94129, USA, for further information.
21
22
#include "mupdf/fitz.h"
23
24
#include "glyphbox.h"
25
26
#include "mupdf/ucdn.h"
27
28
#include <float.h>
29
#include <string.h>
30
31
/* Simple layout structure */
32
33
/*
 * Create an empty layout block.
 *
 * The block lives inside a freshly created pool, which it owns; the
 * line list starts out empty with tailp pointing at head. If the
 * allocation throws, the pool is dropped and the error is rethrown.
 */
fz_layout_block *fz_new_layout(fz_context *ctx)
{
	fz_layout_block *layout;
	fz_pool *arena = fz_new_pool(ctx);

	fz_try(ctx)
	{
		layout = fz_pool_alloc(ctx, arena, sizeof *layout);
		layout->head = NULL;
		layout->tailp = &layout->head;
		layout->pool = arena;
	}
	fz_catch(ctx)
	{
		/* Nothing else refers to the pool yet, so release it here. */
		fz_drop_pool(ctx, arena);
		fz_rethrow(ctx);
	}

	return layout;
}
51
52
/*
 * Destroy a layout block by dropping the pool it was allocated from.
 * Passing NULL is a no-op.
 */
void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
{
	if (block == NULL)
		return;

	fz_drop_pool(ctx, block->pool);
}
57
58
/*
 * Append a new, empty line to the end of the layout block.
 *
 * The line records its position (x, y), font size and the caller's
 * pointer p (stored as-is, not copied). Subsequent calls to
 * fz_add_layout_char append to this line, because text_tailp is
 * redirected to the new line's character list.
 */
void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
{
	fz_layout_line *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);

	entry->p = p;
	entry->font_size = font_size;
	entry->x = x;
	entry->y = y;
	entry->text = NULL;
	entry->next = NULL;

	/* Hook onto the tail of the line list. */
	*block->tailp = entry;
	block->tailp = &entry->next;

	/* Characters added from now on belong to this line. */
	block->text_tailp = &entry->text;
}
71
72
/*
 * Append a character record to the most recently added line.
 *
 * Writes through block->text_tailp, which is only set up by
 * fz_add_layout_line; a line must therefore have been added first.
 * The pointer p is stored as-is, not copied.
 */
void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
{
	fz_layout_char *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);

	entry->p = p;
	entry->advance = advance;
	entry->x = x;
	entry->next = NULL;

	*block->text_tailp = entry;
	block->text_tailp = &entry->next;
}
82
83
/* Extract text into blocks and lines. */
84
85
0
/* All of these thresholds are compared against distances that have
 * been divided by the font size (the expansion of the text matrix). */

/* Baseline offsets up to this start a new line; beyond it, a new paragraph. */
#define PARAGRAPH_DIST 1.5f
/* Along-the-line gaps smaller than this are ignored. */
#define SPACE_DIST 0.15f
/* Along-the-line gaps smaller than this stay on the same line (and may
 * get a synthetic space); larger ones start a new line. */
#define SPACE_MAX_DIST 0.8f
/* Baseline offsets smaller than this are treated as the same line. */
#define BASE_MAX_DIST 0.8f
/* Repeats of a character closer than this to the previous one count as fake bold. */
#define FAKE_BOLD_MAX_DIST 0.1f
90
91
/* We keep a stack of the different metatexts that apply at any
92
 * given point (normally none!). Whenever we get some content
93
 * with a metatext in force, we really want to update the bounds
94
 * for that metatext. But running along the whole list each time
95
 * would be painful. So we just update the bounds for dev->metatext
96
 * and rely on metatext_bounds() propagating it upwards 'just in
97
 * time' for us to use metatexts other than the latest one. This
98
 * also means we need to propagate bounds upwards when we pop
99
 * a metatext.
100
 *
101
 * Why do we need bounds at all? Well, suppose we get:
102
 *    /Span <</ActualText (c) >> BDC /Im0 Do EMC
103
 * Then where on the page do we put 'c' ? By collecting the
104
 * bounds, we can place 'c' wherever the image was.
105
 */
106
/* One entry in the device's stack of active metatexts. */
typedef struct metatext_t
{
	fz_metatext type; /* Kind of metatext (e.g. FZ_METATEXT_ACTUALTEXT). */
	char *text; /* Associated text; presumably owned by this entry - confirm against push/pop code. */
	fz_rect bounds; /* Bounds accumulated for this entry; merged into prev lazily by metatext_bounds(). */
	struct metatext_t *prev; /* Next entry down the stack, or NULL at the bottom. */
} metatext_t;
113
114
/* Details of one 'rect' recorded by the device while collecting styles.
 * NOTE(review): field meanings below are inferred from names only; the
 * code that fills them in is not visible here - confirm. */
typedef struct
{
	fz_point from; /* presumably one end of the rect's centre line */
	fz_point to; /* presumably the other end */
	float thickness; /* presumably the extent perpendicular to from->to */
	fz_rect rect; /* the rectangle itself */
	int argb; /* colour, presumably packed ARGB */
} rect_details;
122
123
/* State for the structured text extraction device. */
typedef struct
{
	fz_device super; /* Embedded base device. */
	fz_stext_page *page; /* Page that extracted blocks are added to. */
	int id; /* Id given to blocks created by this device. */
	/* pen: end point of the last char added. start: start point of the first char on the current line. */
	fz_point pen, start;
	// maybe_bullet: True if the 'start' position recorded was done so after either some actualtext
	// on an image, or after a glyph that's known to be used for bullets. This is used to stop us
	// spotting an 'indented' paragraph, because it's possibly just a bulleted list.
	int maybe_bullet;
	fz_point lag_pen; /* Start point of the last char added. */
	fz_matrix trm; /* Text matrix of the last char added. */
	int lastchar; /* Unicode value of the last char added. */
	fz_stext_line *lastline; /* Line the last char was added to. */
	int lastbidi; /* Bidi value recorded for the last char added. */
	int flags; /* FZ_STEXT_* option flags consulted while adding chars. */
	int color; /* Colour stored on each char as it is added. */
	int last_was_fake_bold; /* Set when the previous glyph was swallowed as a fake-bold overprint. */
	const fz_text *lasttext; /* NOTE(review): not used in the visible code; presumably the last fz_text seen - confirm. */
	fz_stext_options opts; /* Options; opts.flags is checked for FZ_STEXT_COLLECT_STYLES. */

	metatext_t *metatext; /* Top of the metatext stack (NULL if none in force). */

	/* Store the last values we saw. We need this for flushing the actualtext. */
	struct
	{
		int valid;
		int clipped;
		fz_matrix trm;
		int wmode;
		int bidi_level;
		fz_font *font;
		int flags;
	} last;

	/* The list of 'rects' seen during processing (if we're collecting styles). */
	int rect_max; /* presumably the allocated capacity of rects - confirm */
	int rect_len; /* presumably the number of entries in use - confirm */
	rect_details *rects;

	/* Singly owned, doubly linked list of vector blocks whose insertion
	 * into the page has been deferred; see add_lazy_vector() and
	 * flush_lazy_vectors(). Both are NULL when the list is empty. */
	fz_stext_block *lazy_vectors;
	fz_stext_block *lazy_vectors_tail;
} fz_stext_device;
166
167
/* Human-readable summary of the structured text options: one
 * tab-indented line per option, terminated by a blank line. */
const char *fz_stext_options_usage =
	"Structured text options:\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-spans: do not merge spans on the same line\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tparagraph-break: break blocks at paragraph boundaries\n"
	"\tdehyphenate: attempt to join up hyphenated words\n"
	"\tignore-actualtext: do not apply ActualText replacements\n"
	"\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
	"\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
	"\taccurate-bboxes: calculate char bboxes from the outlines\n"
	"\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
	"\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
	"\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
	"\tclip: do not include text that is completely clipped\n"
	"\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
	"\tstructured: collect structure markup\n"
	"\tvectors: include vector bboxes in output\n"
	"\tlazy-vectors: delay vectors that would otherwise split a text line\n"
	"\tfuzzy-vectors: merge abutting horizontal/vertical vectors\n"
	"\tsegment: attempt to segment the page\n"
	"\ttable-hunt: hunt for tables within a (segmented) page\n"
	"\tresolution: resolution to render at\n"
	"\n";
193
194
/* Find the current actualtext, if any. Will abort if dev == NULL. */
195
static metatext_t *
196
find_actualtext(fz_stext_device *dev)
197
0
{
198
0
  metatext_t *mt = dev->metatext;
199
200
0
  while (mt && mt->type != FZ_METATEXT_ACTUALTEXT)
201
0
    mt = mt->prev;
202
203
0
  return mt;
204
0
}
205
206
/* Find the bounds of the given metatext. Will abort if mt or
207
 * dev are NULL. */
208
static fz_rect *
209
metatext_bounds(metatext_t *mt, fz_stext_device *dev)
210
0
{
211
0
  metatext_t *mt2 = dev->metatext;
212
213
0
  while (mt2 != mt)
214
0
  {
215
0
    mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds);
216
0
    mt2 = mt2->prev;
217
0
  }
218
219
0
  return &mt->bounds;
220
0
}
221
222
/* Find the bounds of the current actualtext, or NULL if there
223
 * isn't one. Will abort if dev is NULL. */
224
static fz_rect *
225
actualtext_bounds(fz_stext_device *dev)
226
0
{
227
0
  metatext_t *mt = find_actualtext(dev);
228
229
0
  if (mt == NULL)
230
0
    return NULL;
231
232
0
  return metatext_bounds(mt, dev);
233
0
}
234
235
fz_stext_page *
236
fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
237
0
{
238
0
  fz_pool *pool = fz_new_pool(ctx);
239
0
  fz_stext_page *page = NULL;
240
0
  fz_try(ctx)
241
0
  {
242
0
    page = fz_pool_alloc(ctx, pool, sizeof(*page));
243
0
    page->refs = 1;
244
0
    page->pool = pool;
245
0
    page->mediabox = mediabox;
246
0
    page->first_block = NULL;
247
0
    page->last_block = NULL;
248
0
    page->id_list = fz_new_pool_array(ctx, pool, fz_stext_page_details, 4);
249
0
  }
250
0
  fz_catch(ctx)
251
0
  {
252
0
    fz_drop_pool(ctx, pool);
253
0
    fz_rethrow(ctx);
254
0
  }
255
0
  return page;
256
0
}
257
258
static void
259
drop_run(fz_context *ctx, fz_stext_block *block)
260
0
{
261
0
  fz_stext_line *line;
262
0
  fz_stext_char *ch;
263
0
  while (block)
264
0
  {
265
0
    switch (block->type)
266
0
    {
267
0
    case FZ_STEXT_BLOCK_IMAGE:
268
0
      fz_drop_image(ctx, block->u.i.image);
269
0
      break;
270
0
    case FZ_STEXT_BLOCK_TEXT:
271
0
      for (line = block->u.t.first_line; line; line = line->next)
272
0
        for (ch = line->first_char; ch; ch = ch->next)
273
0
          fz_drop_font(ctx, ch->font);
274
0
      break;
275
0
    case FZ_STEXT_BLOCK_STRUCT:
276
0
      drop_run(ctx, block->u.s.down->first_block);
277
0
      break;
278
0
    default:
279
0
      break;
280
0
    }
281
0
    block = block->next;
282
0
  }
283
0
}
284
285
/*
 * Look up the page details entry for a block, using the block's id as
 * the index into the page's id_list.
 *
 * Throws FZ_ERROR_ARGUMENT if either page or block is NULL.
 */
fz_stext_page_details *fz_stext_page_details_for_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block)
{
	fz_stext_page_details *details;

	if (!(block != NULL && page != NULL))
		fz_throw(ctx, FZ_ERROR_ARGUMENT, "page details require a page and a block");

	details = (fz_stext_page_details *)fz_pool_array_lookup(ctx, page->id_list, block->id);
	return details;
}
292
293
fz_stext_page *
294
fz_keep_stext_page(fz_context *ctx, fz_stext_page *page)
295
0
{
296
0
  return fz_keep_imp(ctx, page, &page->refs);
297
0
}
298
299
void
300
fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
301
0
{
302
0
  if (page == NULL)
303
0
    return;
304
305
0
  if (fz_drop_imp(ctx, page, &page->refs))
306
0
  {
307
0
    drop_run(ctx, page->first_block);
308
0
    fz_drop_pool(ctx, page->pool);
309
0
  }
310
0
}
311
312
/*
313
 * This adds a new block at the end of the page. This should not be used
314
 * to add 'struct' blocks to the page as those have to be added internally,
315
 * with more complicated pointer setup.
316
 */
317
static fz_stext_block *
318
add_block_to_page(fz_context *ctx, fz_stext_page *page, int type, int id)
319
0
{
320
0
  fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
321
0
  block->bbox = fz_empty_rect; /* Fixes bug 703267. */
322
0
  block->prev = page->last_block;
323
0
  block->type = type;
324
0
  block->id = id;
325
0
  if (page->last_struct)
326
0
  {
327
0
    if (page->last_struct->last_block)
328
0
    {
329
0
      block->prev = page->last_struct->last_block;
330
0
      block->prev->next = block;
331
0
      page->last_struct->last_block = block;
332
0
    }
333
0
    else
334
0
      page->last_struct->last_block = page->last_struct->first_block = block;
335
0
  }
336
0
  else if (!page->last_block)
337
0
  {
338
0
    assert(!page->first_block);
339
0
    page->first_block = page->last_block = block;
340
0
  }
341
0
  else
342
0
  {
343
0
    page->last_block->next = block;
344
0
    page->last_block = block;
345
0
  }
346
0
  return block;
347
0
}
348
349
static fz_stext_block *
350
add_lazy_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, int id)
351
0
{
352
0
  fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
353
0
  block->bbox = fz_empty_rect;
354
0
  block->prev = tdev->lazy_vectors_tail;
355
0
  block->type = FZ_STEXT_BLOCK_VECTOR;
356
0
  block->id = id;
357
358
0
  if (tdev->lazy_vectors == NULL)
359
0
    tdev->lazy_vectors = block;
360
0
  else
361
0
    tdev->lazy_vectors_tail->next = block;
362
0
  tdev->lazy_vectors_tail = block;
363
364
0
  return block;
365
0
}
366
367
static void
368
flush_lazy_vectors(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev)
369
0
{
370
0
  if (tdev->lazy_vectors == NULL)
371
0
    return;
372
373
0
  if (page->last_struct)
374
0
  {
375
0
    if (page->last_struct->last_block)
376
0
    {
377
0
      page->last_struct->last_block->next = tdev->lazy_vectors;
378
0
      tdev->lazy_vectors->prev = page->last_struct->last_block;
379
0
      page->last_struct->last_block = tdev->lazy_vectors_tail;
380
0
    }
381
0
    else
382
0
    {
383
0
      page->last_struct->first_block = tdev->lazy_vectors;
384
0
      page->last_struct->last_block = tdev->lazy_vectors_tail;
385
0
    }
386
0
  }
387
0
  else if (!page->last_block)
388
0
  {
389
0
    page->first_block = tdev->lazy_vectors;
390
0
    page->last_block = tdev->lazy_vectors_tail;
391
0
  }
392
0
  else
393
0
  {
394
0
    page->last_block->next = tdev->lazy_vectors;
395
0
    tdev->lazy_vectors->prev = page->last_block;
396
0
    page->last_block = tdev->lazy_vectors_tail;
397
0
  }
398
399
0
  tdev->lazy_vectors = tdev->lazy_vectors_tail = NULL;
400
0
}
401
402
static fz_stext_block *
403
add_text_block_to_page(fz_context *ctx, fz_stext_page *page, int id)
404
0
{
405
0
  return add_block_to_page(ctx, page, FZ_STEXT_BLOCK_TEXT, id);
406
0
}
407
408
static fz_stext_block *
409
add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image, int id)
410
0
{
411
0
  fz_stext_block *block = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_IMAGE, id);
412
0
  block->u.i.transform = ctm;
413
0
  block->u.i.image = fz_keep_image(ctx, image);
414
0
  block->bbox = fz_transform_rect(fz_unit_rect, ctm);
415
0
  return block;
416
0
}
417
418
static fz_stext_line *
419
add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
420
0
{
421
0
  fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
422
0
  line->prev = block->u.t.last_line;
423
0
  if (!block->u.t.first_line)
424
0
    block->u.t.first_line = block->u.t.last_line = line;
425
0
  else
426
0
  {
427
0
    block->u.t.last_line->next = line;
428
0
    block->u.t.last_line = line;
429
0
  }
430
431
0
  line->dir = *dir;
432
0
  line->wmode = wmode;
433
434
0
  return line;
435
0
}
436
437
0
/* Special 'glyph' values understood by add_char_to_line (horizontal
 * text only). Any other value is passed to fz_bound_glyph to measure
 * the real glyph. */

/* A synthetic space added in accurate-bbox mode: zero ascent and descent. */
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
/* Not in accurate-bbox mode: use the font's ascender and descender. */
#define NON_ACCURATE_GLYPH (-1)
439
440
static fz_stext_char *
441
add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
442
0
{
443
0
  fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
444
0
  fz_point a, d;
445
446
0
  if (!line->first_char)
447
0
    line->first_char = line->last_char = ch;
448
0
  else
449
0
  {
450
0
    line->last_char->next = ch;
451
0
    line->last_char = ch;
452
0
  }
453
454
0
  ch->c = c;
455
0
  ch->argb = color;
456
0
  ch->bidi = bidi;
457
0
  ch->origin = *p;
458
0
  ch->size = size;
459
0
  ch->font = fz_keep_font(ctx, font);
460
0
  ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0) | (synthetic > 1 ? FZ_STEXT_SYNTHETIC_LARGE : 0);
461
0
  if (font->flags.is_bold)
462
0
    ch->flags |= FZ_STEXT_BOLD;
463
464
0
  if (line->wmode == 0)
465
0
  {
466
0
    fz_rect bounds;
467
0
    int bounded = 0;
468
0
    a.x = 0;
469
0
    d.x = 0;
470
0
    if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
471
0
    {
472
      /* Added space, in accurate mode. */
473
0
      a.y = d.y = 0;
474
0
    }
475
0
    else if (glyph == NON_ACCURATE_GLYPH)
476
0
    {
477
      /* Non accurate mode. */
478
0
      a.y = fz_font_ascender(ctx, font);
479
0
      d.y = fz_font_descender(ctx, font);
480
0
    }
481
0
    else
482
0
    {
483
      /* Any glyph in accurate mode */
484
0
      bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
485
0
      bounded = 1;
486
0
      a.y = bounds.y1;
487
0
      d.y = bounds.y0;
488
0
    }
489
0
    if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
490
0
    {
491
0
      if (!bounded)
492
0
        bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
493
0
      if (a.x > bounds.x0)
494
0
        a.x = bounds.x0;
495
0
      if (d.y < bounds.x1)
496
0
        d.y = bounds.x1;
497
0
    }
498
0
  }
499
0
  else
500
0
  {
501
0
    a.x = 1;
502
0
    d.x = 0;
503
0
    a.y = 0;
504
0
    d.y = 0;
505
0
  }
506
0
  a = fz_transform_vector(a, trm);
507
0
  d = fz_transform_vector(d, trm);
508
509
0
  ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
510
0
  ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
511
0
  ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
512
0
  ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
513
514
0
  return ch;
515
0
}
516
517
/*
 * Reverse, in place, the singly linked run of chars that starts at
 * curr and stops just before tail (tail may be NULL). The char that
 * was first ends up linked to tail. Returns the new first char of the
 * run (or tail itself if the run is empty).
 */
static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
{
	fz_stext_char *reversed = tail;

	while (curr != tail)
	{
		fz_stext_char *rest = curr->next;

		curr->next = reversed;
		reversed = curr;
		curr = rest;
	}

	return reversed;
}
530
531
/*
 * Reverse each maximal run of consecutive chars with a non-zero bidi
 * value within a line, and leave line->last_char pointing at the final
 * char of the resulting list. Runs of length one are left alone.
 */
static void reverse_bidi_line(fz_stext_line *line)
{
	fz_stext_char **link = &line->first_char;
	fz_stext_char *ch = line->first_char;

	while (ch != NULL)
	{
		if (ch->bidi)
		{
			/* Find the first char beyond this run. */
			fz_stext_char *beyond = ch->next;

			while (beyond != NULL && beyond->bidi)
				beyond = beyond->next;

			/* Only runs of two or more chars need reversing. After the
			 * reversal ch is the last char of the run, linked to beyond. */
			if (beyond != ch->next)
				*link = reverse_bidi_span(ch, beyond);
		}

		link = &ch->next;
		line->last_char = ch;
		ch = ch->next;
	}
}
549
550
/*
 * Return 1 if c is one of the hyphen characters recognised for
 * dehyphenation, 0 otherwise.
 */
int fz_is_unicode_hyphen(int c)
{
	switch (c)
	{
	case '-':    /* hyphen-minus */
	case 0xAD:   /* soft hyphen */
	case 0x2010: /* hyphen */
	case 0x2011: /* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
555
556
static float
557
vec_dot(const fz_point *a, const fz_point *b)
558
0
{
559
0
  return a->x * b->x + a->y * b->y;
560
0
}
561
562
/*
 * Decide whether a synthetic space may be inserted after lastchar.
 *
 * Never after an actual space. Otherwise allowed for code points
 * below 0x700 (basic latin, greek, cyrillic, hebrew, arabic) and for
 * 0x2000..0x20CF (general punctuation, superscripts and subscripts,
 * and currency symbols).
 */
static int may_add_space(int lastchar)
{
	if (lastchar == ' ')
		return 0;
	if (lastchar < 0x700)
		return 1;
	return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
571
572
0
#define FAKEBOLD_THRESHOLD_RECIP (1.0f / FAKE_BOLD_MAX_DIST)
573
574
static int
575
is_within_fake_bold_distance(float a, float b, float size)
576
0
{
577
0
  a -= b;
578
0
  if (a < 0)
579
0
    a = -a;
580
581
0
  return FAKEBOLD_THRESHOLD_RECIP * a < size;
582
0
}
583
584
static int
585
font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
586
0
{
587
0
  unsigned char fdigest[16];
588
0
  unsigned char gdigest[16];
589
590
0
  if (f == g)
591
0
    return 1;
592
593
0
  if (strcmp(f->name, g->name) != 0)
594
0
    return 0;
595
596
0
  if (f->buffer == NULL || g->buffer == NULL)
597
0
    return 0;
598
599
0
  fz_font_digest(ctx, f, fdigest);
600
0
  fz_font_digest(ctx, g, gdigest);
601
602
0
  return (memcmp(fdigest, gdigest, 16) == 0);
603
0
}
604
605
static int
606
check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
607
0
{
608
0
  fz_stext_line *line;
609
0
  fz_stext_char *ch;
610
611
0
  for (; block != NULL; block = block->next)
612
0
  {
613
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
614
0
    {
615
0
      if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
616
0
        return 1;
617
0
    }
618
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
619
0
    {
620
0
      for (line = block->u.t.first_line; line != NULL; line = line->next)
621
0
      {
622
0
        fz_stext_char *pr = NULL;
623
0
        for (ch = line->first_char; ch != NULL; ch = ch->next)
624
0
        {
625
          /* Not perfect, but it'll do! */
626
0
          if (ch->c == c && is_within_fake_bold_distance(ch->origin.x, p.x, size) && is_within_fake_bold_distance(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
627
0
          {
628
            /* If we were filled before, and we are stroking now... */
629
0
            if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
630
0
              (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
631
0
            {
632
              /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
633
0
              ch->flags |= flags;
634
0
              return 1;
635
0
            }
636
            /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
637
             * as boldening if either the char before, or the char after were also boldened. */
638
0
            ch->flags |= flags;
639
640
0
            if (c == ' ')
641
0
            {
642
0
              if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
643
0
                (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
644
0
              {
645
                /* OK, we can be bold. */
646
0
                ch->flags |= FZ_STEXT_BOLD;
647
0
              }
648
              /* Whether we have recorded this as being bold or not, still
649
               * claim we did, so we swallow the space and don't reemit it. */
650
0
              return 1;
651
0
            }
652
0
            else
653
0
            {
654
0
              ch->flags |= FZ_STEXT_BOLD;
655
0
              return 1;
656
0
            }
657
0
          }
658
0
          pr = ch;
659
0
        }
660
0
      }
661
0
    }
662
0
  }
663
664
0
  return 0;
665
0
}
666
667
/*
 * Return 1 if c is a character that could plausibly be acting as a
 * list bullet (including digits and the replacement character),
 * 0 otherwise.
 */
static int
plausible_bullet(int c)
{
	switch (c)
	{
	case '*':
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	case 0x00B7: /* Middle Dot */
	case 0x2022: /* Bullet */
	case 0x2023: /* Triangular Bullet */
	case 0x2043: /* Hyphen Bullet */
	case 0x204C: /* Back leftwards bullet */
	case 0x204D: /* Back rightwards bullet */
	case 0x2219: /* Bullet operator */
	case 0x25C9: /* Fisheye */
	case 0x25CB: /* White circle */
	case 0x25CF: /* Black circle */
	case 0x25D8: /* Inverse Bullet */
	case 0x25E6: /* White Bullet */
	case 0x2619: /* Reversed Rotated Floral Heart Bullet / Fleuron */
	case 0x261a: /* Black left pointing index */
	case 0x261b: /* Black right pointing index */
	case 0x261c: /* White left pointing index */
	case 0x261d: /* White up pointing index */
	case 0x261e: /* White right pointing index */
	case 0x261f: /* White down pointing index */
	case 0x2660: /* Black Spade suit */
	case 0x2661: /* White Heart suit */
	case 0x2662: /* White Diamond suit */
	case 0x2663: /* Black Club suit */
	case 0x2664: /* White Spade suit */
	case 0x2665: /* Black Heart suit */
	case 0x2666: /* Black Diamond suit */
	case 0x2667: /* White Club suit */
	case 0x2765: /* Rotated Heavy Heart Black Heart Bullet */
	case 0x2767: /* Rotated Floral Heart Bullet / Fleuron */
	case 0x29BE: /* Circled White Bullet */
	case 0x29BF: /* Circled Bullet */
	case 0xFFFD: /* UNICODE_REPLACEMENT_CHARACTER */
	case 0x1F446: /* WHITE UP POINTING BACKHAND INDEX */
	case 0x1F447: /* WHITE DOWN POINTING BACKHAND INDEX */
	case 0x1F448: /* WHITE LEFT POINTING BACKHAND INDEX */
	case 0x1F449: /* WHITE RIGHT POINTING BACKHAND INDEX */
	case 0x1f597: /* White down pointing left hand index */
	case 0x1F598: /* SIDEWAYS WHITE LEFT POINTING INDEX */
	case 0x1F599: /* SIDEWAYS WHITE RIGHT POINTING INDEX */
	case 0x1F59A: /* SIDEWAYS BLACK LEFT POINTING INDEX */
	case 0x1F59B: /* SIDEWAYS BLACK RIGHT POINTING INDEX */
	case 0x1F59C: /* BLACK LEFT POINTING BACKHAND INDEX */
	case 0x1F59D: /* BLACK RIGHT POINTING BACKHAND INDEX */
	case 0x1F59E: /* SIDEWAYS WHITE UP POINTING INDEX */
	case 0x1F59F: /* SIDEWAYS WHITE DOWN POINTING INDEX */
	case 0x1F5A0: /* SIDEWAYS BLACK UP POINTING INDEX */
	case 0x1F5A1: /* SIDEWAYS BLACK DOWN POINTING INDEX */
	case 0x1F5A2: /* BLACK UP POINTING BACKHAND INDEX */
	case 0x1F5A3: /* BLACK DOWN POINTING BACKHAND INDEX */
	case 0x1FBC1: /* LEFT THIRD WHITE RIGHT POINTING INDEX */
	case 0x1FBC2: /* MIDDLE THIRD WHITE RIGHT POINTING INDEX */
	case 0x1FBC3: /* RIGHT THIRD WHITE RIGHT POINTING INDEX */
		return 1;
	default:
		return 0;
	}
}
726
727
/*
 * Add one character to the page being built by dev, deciding from the
 * pen movement since the previous character whether it continues the
 * current line, needs a synthetic space first, starts a new line, or
 * starts a new block (paragraph).
 *
 * font: font the char is drawn in.
 * c: unicode value.
 * glyph: glyph id; -1 means 'no glyph', and -2 marks the first no-glyph
 *	char from an actualtext (its position is valid, so the pen is
 *	advanced for it).
 * trm: text matrix; e,f give the glyph origin and its expansion is
 *	used as the font size that all distances are scaled by.
 * adv: advance, as a multiple of the transformed direction vector.
 * wmode: 0 for horizontal writing, anything else for vertical.
 * bidi: only bit 0 (RTL-ness) is used.
 * force_new_line: non-zero to start a new line regardless.
 * flags: char flags passed through to add_char_to_line and to the
 *	fake bold check.
 *
 * The char may be dropped entirely when it is judged to be a fake-bold
 * overprint of an existing char. On success the device's pen, lag_pen,
 * lastchar, lastbidi, lastline and trm are updated.
 */
static void
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
{
	fz_stext_page *page = dev->page;
	fz_stext_block *cur_block;
	fz_stext_line *cur_line = NULL;

	int new_para = 0;
	int new_line = 1;
	int add_space = 0;
	fz_point dir, ndir, p, q;
	float size;
	fz_point delta;
	float spacing = 0;
	float base_offset = 0;
	float dist;

	/* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
	bidi = bidi & 1;

	/* dir = direction vector for motion. ndir = normalised(dir) */
	if (wmode == 0)
	{
		dir.x = 1;
		dir.y = 0;
	}
	else
	{
		dir.x = 0;
		dir.y = -1;
	}
	dir = fz_transform_vector(dir, trm);
	ndir = fz_normalize_vector(dir);

	/* All distances below are expressed as multiples of this size. */
	size = fz_matrix_expansion(trm);

	/* We need to identify where glyphs 'start' (p) and 'stop' (q).
	 * Each glyph holds its 'start' position, and the next glyph in the
	 * span (or span->max if there is no next glyph) holds its 'end'
	 * position.
	 *
	 * For both horizontal and vertical motion, trm->{e,f} gives the
	 * origin (usually the bottom left) of the glyph.
	 *
	 * In horizontal mode:
	 *   + p is bottom left.
	 *   + q is the bottom right
	 * In vertical mode:
	 *   + p is top left (where it advanced from)
	 *   + q is bottom left
	 */
	if (wmode == 0)
	{
		p.x = trm.e;
		p.y = trm.f;
		q.x = trm.e + adv * dir.x;
		q.y = trm.f + adv * dir.y;
	}
	else
	{
		p.x = trm.e - adv * dir.x;
		p.y = trm.f - adv * dir.y;
		q.x = trm.e;
		q.y = trm.f;
	}

	//printf("%g,%g \"%c\" %g,%g\n", p.x, p.y, c, q.x, q.y);

	if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
	{
		if (glyph < 0)
		{
			/* A no-glyph char following a swallowed fake-bold glyph is
			 * swallowed too. */
			if (dev->last_was_fake_bold)
				return;
		}
		else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
		{
			/* An existing char was updated instead; don't emit this one. */
			dev->last_was_fake_bold = 1;
			return;
		}
		dev->last_was_fake_bold = 0;
	}

	/* Find current position to enter new text. */
	cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
	if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
		cur_block = NULL;
	cur_line = cur_block ? cur_block->u.t.last_line : NULL;

	/* We use glyph == -2 to indicate the first no-glyph char from an actualtext. The position
	 * is valid though, so we want to advance the pen for these. */

	/* Don't advance pen or break lines for either no-glyph or marking non-spacing characters in a cluster */
	if (cur_line && (glyph == -1 || ucdn_get_general_category(c) == UCDN_GENERAL_CATEGORY_MN))
	{
		/* Zero-width char placed at the current pen position. */
		add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
		dev->lastbidi = bidi;
		dev->lastchar = c;
		dev->lastline = cur_line;
		return;
	}

	if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
	{
		/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
		 * then we can't append to the current block/line. */
		new_para = 1;
		new_line = 1;
	}
	else
	{
		/* Detect fake bold where text is printed twice in the same place. */
		/* Largely supplanted by the check_for_fake_bold mechanism above,
		 * but we leave this in for backward compatibility as it's cheap,
		 * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
		dist = hypotf(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y) / size;
		/* This can trigger improperly for glyphs that come from actualtext
		 * as they are frequently overlaid. Therefore rely on glyph >= 0. */
		if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar && glyph >= 0)
			return;

		/* Calculate how far we've moved since the last character. */
		delta.x = p.x - dev->pen.x;
		delta.y = p.y - dev->pen.y;

		/* The transform has not changed, so we know we're in the same
		 * direction. Calculate 2 distances; how far off the previous
		 * baseline we are, together with how far along the baseline
		 * we are from the expected position. */
		spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
		base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;

		/* Only a small amount off the baseline - we'll take this */
		if (fabsf(base_offset) < BASE_MAX_DIST)
		{
			/* If mixed LTR and RTL content */
			if ((bidi & 1) != (dev->lastbidi & 1))
			{
				/* Ignore jumps within line when switching between LTR and RTL text. */
				new_line = 0;
			}

			/* RTL */
			else if (bidi & 1)
			{
				/* Movement measured from the start of the previous char,
				 * plus this char's advance. */
				fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
				float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;

				/* If the pen is where we would have been if we
				 * had advanced backwards from the previous
				 * character by this character's advance, we
				 * are probably seeing characters emitted in
				 * logical order.
				 */
				if (fabsf(logical_spacing) < SPACE_DIST)
				{
					new_line = 0;
				}

				/* However, if the pen has advanced to where we would expect it
				 * in an LTR context, we're seeing them emitted in visual order
				 * and should flag them for reordering!
				 */
				else if (fabsf(spacing) < SPACE_DIST)
				{
					bidi = 3; /* mark line as visual */
					new_line = 0;
				}

				/* And any other small jump could be a missing space. */
				else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
				{
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}
				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
				{
					/* Motion is in line, but negative. We've probably got overlapping
					 * chars here. Live with it. */
					new_line = 0;
				}
				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
				{
					bidi = 3; /* mark line as visual */
					/* add_space of 2 marks a large synthetic space (see add_char_to_line). */
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1 + (spacing > SPACE_DIST*2);
					new_line = 0;
				}

				else
				{
					/* Motion is large and unexpected (probably a new table column). */
					new_line = 1;
				}
			}

			/* LTR or neutral character */
			else
			{
				if (fabsf(spacing) < SPACE_DIST)
				{
					/* Motion is in line and small enough to ignore. */
					new_line = 0;
				}
				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
				{
					/* Motion is in line, but negative. We've probably got overlapping
					 * chars here. Live with it. */
					new_line = 0;
				}
				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
				{
					/* Motion is forward in line and large enough to warrant us adding a space. */
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1 + (spacing > SPACE_DIST*2);
					new_line = 0;
				}
				else
				{
					/* Motion is large and unexpected (probably a new table column). */
					new_line = 1;
				}
			}
		}

		/* Enough for a new line, but not enough for a new paragraph */
		else if (fabsf(base_offset) <= PARAGRAPH_DIST)
		{
			/* Check indent to spot text-indent style paragraphs */
			if (wmode == 0 && cur_line)
				if ((p.x - dev->start.x) > 0.5f && !dev->maybe_bullet)
					new_para = 1;
			new_line = 1;
		}

		/* Way off the baseline - open a new paragraph */
		else
		{
			new_para = 1;
			new_line = 1;
		}
	}

	/* Start a new block (but only at the beginning of a text object) */
	if (new_para || !cur_block)
	{
		/* Deferred vectors must land on the page before the new block. */
		flush_lazy_vectors(ctx, page, dev);
		cur_block = add_text_block_to_page(ctx, page, dev->id);
		cur_line = cur_block->u.t.last_line;
	}

	/* When dehyphenating, a line that ended in a hyphen is flagged as
	 * joined to whatever follows it. */
	if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(dev->lastchar) && dev->lastline != NULL)
		dev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED;

	/* Start a new line */
	if (new_line || !cur_line || force_new_line)
	{
		cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
		/* Remember where the line began, and whether it began with
		 * something that could be a bullet. */
		dev->start = p;
		if (glyph == -2)
			dev->maybe_bullet = 1;
		else
			dev->maybe_bullet = plausible_bullet(c);
	}

	/* Henceforth treat such non-glyphs in the usual way. */
	if (glyph == -2)
		glyph = -1;

	/* Add synthetic space */
	if (c != ' ' && add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
		add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, add_space, flags, dev->flags);

	add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);

	dev->lastchar = c;
	dev->lastbidi = bidi;
	dev->lastline = cur_line;
	dev->lag_pen = p;
	dev->pen = q;

	dev->trm = trm;
}
1011
1012
static void
1013
fz_add_stext_char(fz_context *ctx,
1014
  fz_stext_device *dev,
1015
  fz_font *font,
1016
  int c,
1017
  int glyph,
1018
  fz_matrix trm,
1019
  float adv,
1020
  int wmode,
1021
  int bidi,
1022
  int force_new_line,
1023
  int flags)
1024
0
{
1025
  /* ignore when one unicode character maps to multiple glyphs */
1026
0
  if (c == -1)
1027
0
    return;
1028
1029
0
  if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS)
1030
0
    fz_calculate_font_ascender_descender(ctx, font);
1031
1032
0
  if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
1033
0
  {
1034
0
    switch (c)
1035
0
    {
1036
0
    case 0xFB00: /* ff */
1037
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1038
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
1039
0
      return;
1040
0
    case 0xFB01: /* fi */
1041
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1042
0
      fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
1043
0
      return;
1044
0
    case 0xFB02: /* fl */
1045
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1046
0
      fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
1047
0
      return;
1048
0
    case 0xFB03: /* ffi */
1049
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1050
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
1051
0
      fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
1052
0
      return;
1053
0
    case 0xFB04: /* ffl */
1054
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1055
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
1056
0
      fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
1057
0
      return;
1058
0
    case 0xFB05: /* long st */
1059
0
    case 0xFB06: /* st */
1060
0
      fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1061
0
      fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags);
1062
0
      return;
1063
0
    }
1064
1065
    /* alphabetic and arabic presentation forms */
1066
0
    if ((c >= 0xfb00 && c <= 0xfdff) || (c >= 0xfe70 && c <= 0xfefc))
1067
0
    {
1068
0
      uint32_t lig[18];
1069
0
      int i, n = ucdn_compat_decompose(c, lig);
1070
0
      fz_add_stext_char_imp(ctx, dev, font, lig[0], glyph, trm, adv, wmode, bidi, force_new_line, flags);
1071
0
      for (i = 1; i < n; ++i)
1072
0
        fz_add_stext_char_imp(ctx, dev, font, lig[i], -1, trm, 0, wmode, bidi, 0, flags);
1073
0
      return;
1074
0
    }
1075
0
  }
1076
1077
0
  if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
1078
0
  {
1079
0
    switch (c)
1080
0
    {
1081
0
    case 0x0009: /* tab */
1082
0
    case 0x0020: /* space */
1083
0
    case 0x00A0: /* no-break space */
1084
0
    case 0x1680: /* ogham space mark */
1085
0
    case 0x180E: /* mongolian vowel separator */
1086
0
    case 0x2000: /* en quad */
1087
0
    case 0x2001: /* em quad */
1088
0
    case 0x2002: /* en space */
1089
0
    case 0x2003: /* em space */
1090
0
    case 0x2004: /* three-per-em space */
1091
0
    case 0x2005: /* four-per-em space */
1092
0
    case 0x2006: /* six-per-em space */
1093
0
    case 0x2007: /* figure space */
1094
0
    case 0x2008: /* punctuation space */
1095
0
    case 0x2009: /* thin space */
1096
0
    case 0x200A: /* hair space */
1097
0
    case 0x202F: /* narrow no-break space */
1098
0
    case 0x205F: /* medium mathematical space */
1099
0
    case 0x3000: /* ideographic space */
1100
0
      c = ' ';
1101
0
    }
1102
0
  }
1103
1104
0
  fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
1105
0
}
1106
1107
static fz_rect
1108
current_clip(fz_context *ctx, fz_stext_device *dev)
1109
0
{
1110
0
  fz_rect r = fz_infinite_rect;
1111
1112
0
  if (dev->flags & FZ_STEXT_CLIP)
1113
0
  {
1114
0
    r = fz_device_current_scissor(ctx, &dev->super);
1115
0
    r = fz_intersect_rect(r, dev->page->mediabox);
1116
0
  }
1117
0
  if (dev->flags & FZ_STEXT_CLIP_RECT)
1118
0
    r = fz_intersect_rect(r, dev->opts.clip);
1119
1120
0
  return r;
1121
0
}
1122
1123
static void
1124
do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
1125
0
{
1126
0
  fz_font *font = span->font;
1127
0
  fz_matrix tm = span->trm;
1128
0
  float adv;
1129
0
  int unicode;
1130
0
  int i;
1131
1132
0
  for (i = start; i < end; i++)
1133
0
  {
1134
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1135
0
    {
1136
0
      fz_rect r = current_clip(ctx, dev);
1137
0
      if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
1138
0
      {
1139
0
        dev->last.clipped = 1;
1140
0
        continue;
1141
0
      }
1142
0
    }
1143
0
    dev->last.clipped = 0;
1144
1145
    /* Calculate new pen location and delta */
1146
0
    tm.e = span->items[i].x;
1147
0
    tm.f = span->items[i].y;
1148
0
    dev->last.trm = fz_concat(tm, ctm);
1149
0
    dev->last.bidi_level = span->bidi_level;
1150
0
    dev->last.wmode = span->wmode;
1151
0
    if (font != dev->last.font)
1152
0
    {
1153
0
      fz_drop_font(ctx, dev->last.font);
1154
0
      dev->last.font = fz_keep_font(ctx, font);
1155
0
    }
1156
0
    dev->last.valid = 1;
1157
0
    dev->last.flags = flags;
1158
1159
    /* Calculate bounding box and new pen position based on font metrics */
1160
0
    if (span->items[i].gid >= 0)
1161
0
      adv = span->items[i].adv;
1162
0
    else
1163
0
      adv = 0;
1164
1165
0
    unicode = span->items[i].ucs;
1166
0
    if (unicode == FZ_REPLACEMENT_CHARACTER)
1167
0
    {
1168
0
      if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
1169
0
      {
1170
0
        unicode = span->items[i].cid;
1171
0
        flags |= FZ_STEXT_UNICODE_IS_CID;
1172
0
      }
1173
0
      else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
1174
0
      {
1175
0
        unicode = span->items[i].gid;
1176
0
        flags |= FZ_STEXT_UNICODE_IS_GID;
1177
0
      }
1178
0
    }
1179
1180
    /* Send the chars we have through. */
1181
0
    fz_add_stext_char(ctx, dev, font,
1182
0
      unicode,
1183
0
      span->items[i].gid,
1184
0
      dev->last.trm,
1185
0
      adv,
1186
0
      dev->last.wmode,
1187
0
      dev->last.bidi_level,
1188
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1189
0
      flags);
1190
0
  }
1191
0
}
1192
1193
/*
	Return the rune at position idx (counted in runes, from 0) of the
	given UTF-8 string, or -1 if the terminating zero rune is reached
	at or before that position.
*/
static int
rune_index(const char *utf8, size_t idx)
{
	size_t remaining = idx;
	int rune;

	for (;;)
	{
		utf8 += fz_chartorune(&rune, utf8);
		if (rune == 0)
			return -1;
		if (remaining == 0)
			break;
		remaining--;
	}

	return rune;
}
1209
1210
static void
1211
flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i, int end)
1212
0
{
1213
0
  int glyph = -2;
1214
1215
0
  if (*actualtext == 0)
1216
0
    return;
1217
1218
0
  if (!dev->last.valid)
1219
0
    return;
1220
1221
0
  if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1222
0
    if (dev->last.clipped)
1223
0
      return;
1224
1225
0
  while (end < 0 || (end >= 0 && i < end))
1226
0
  {
1227
0
    int rune;
1228
0
    actualtext += fz_chartorune(&rune, actualtext);
1229
1230
0
    if (rune == 0)
1231
0
      break;
1232
1233
0
    fz_add_stext_char(ctx, dev, dev->last.font,
1234
0
      rune,
1235
0
      glyph,
1236
0
      dev->last.trm,
1237
0
      0,
1238
0
      dev->last.wmode,
1239
0
      dev->last.bidi_level,
1240
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1241
0
      dev->last.flags);
1242
0
    i++;
1243
1244
0
    glyph = -1; /* -1 for all but first glyph in the actualtext run */
1245
0
  }
1246
0
}
1247
1248
static void
1249
do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
1250
0
{
1251
  /* We are within an actualtext block. This means we can't just add the chars
1252
   * as they are. We need to add the chars as they are meant to be. Sadly the
1253
   * actualtext mechanism doesn't help us at all with positioning. */
1254
0
  fz_font *font = span->font;
1255
0
  fz_matrix tm = span->trm;
1256
0
  float adv;
1257
0
  int start, i, end;
1258
0
  char *actualtext = mt->text;
1259
0
  size_t z = fz_utflen(actualtext);
1260
1261
  /* If actualtext is empty, nothing to do! */
1262
0
  if (z == 0)
1263
0
    return;
1264
1265
  /* Now, we HOPE that the creator of a PDF will minimise the actual text
1266
   * differences, so that we'll get:
1267
   *   "Politicians <ActualText="lie">fib</ActualText>, always."
1268
   * rather than:
1269
   *   "<ActualText="Politicians lie, always">Politicians fib, always.</ActualText>
1270
   * but experience with PDF files tells us that this won't always be the case.
1271
   *
1272
   * We try to minimise the actualtext section here, just in case.
1273
   */
1274
1275
  /* Spot a matching prefix and send it. */
1276
0
  for (start = 0; start < span->len; start++)
1277
0
  {
1278
0
    int rune;
1279
0
    int len = fz_chartorune(&rune, actualtext);
1280
0
    if (span->items[start].ucs != rune || rune == 0)
1281
0
      break;
1282
0
    actualtext += len; z--;
1283
0
  }
1284
0
  if (start != 0)
1285
0
    do_extract(ctx, dev, span, ctm, 0, start, flags);
1286
1287
0
  if (start == span->len)
1288
0
  {
1289
    /* The prefix has consumed all this object. Just shorten the actualtext and we'll
1290
     * catch the rest next time. */
1291
0
    z = strlen(actualtext)+1;
1292
0
    memmove(mt->text, actualtext, z);
1293
0
    return;
1294
0
  }
1295
1296
  /* We haven't consumed the whole string, so there must be runes left.
1297
   * Shut coverity up. */
1298
0
  assert(z != 0);
1299
1300
  /* Spot a matching postfix. Can't send it til the end. */
1301
0
  for (end = span->len; end > start; end--)
1302
0
  {
1303
    /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
1304
0
    int rune = rune_index(actualtext, z-1);
1305
0
    if (span->items[end-1].ucs != rune)
1306
0
      break;
1307
0
    z--;
1308
0
  }
1309
  /* So we can send end -> span->len at the end. */
1310
1311
  /* So we have at least SOME chars that don't match. */
1312
  /* Now, do the difficult bit in the middle.*/
1313
  /* items[start..end] have to be sent with actualtext[start..z] */
1314
0
  for (i = start; i < end; i++)
1315
0
  {
1316
0
    fz_text_item *item = &span->items[i];
1317
0
    int rune = -1;
1318
1319
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1320
0
    {
1321
0
      fz_rect r = current_clip(ctx, dev);
1322
0
      if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
1323
0
      {
1324
0
        dev->last.clipped = 1;
1325
0
        continue;
1326
0
      }
1327
0
    }
1328
0
    dev->last.clipped = 0;
1329
1330
0
    if ((size_t)i < z)
1331
0
      actualtext += fz_chartorune(&rune, actualtext);
1332
1333
    /* Calculate new pen location and delta */
1334
0
    tm.e = item->x;
1335
0
    tm.f = item->y;
1336
0
    dev->last.trm = fz_concat(tm, ctm);
1337
0
    dev->last.bidi_level = span->bidi_level;
1338
0
    dev->last.wmode = span->wmode;
1339
0
    if (font != dev->last.font)
1340
0
    {
1341
0
      fz_drop_font(ctx, dev->last.font);
1342
0
      dev->last.font = fz_keep_font(ctx, font);
1343
0
    }
1344
0
    dev->last.valid = 1;
1345
0
    dev->last.flags = flags;
1346
1347
    /* Calculate bounding box and new pen position based on font metrics */
1348
0
    if (item->gid >= 0)
1349
0
      adv = item->adv;
1350
0
    else
1351
0
      adv = 0;
1352
1353
0
    fz_add_stext_char(ctx, dev, font,
1354
0
      rune,
1355
0
      span->items[i].gid,
1356
0
      dev->last.trm,
1357
0
      adv,
1358
0
      dev->last.wmode,
1359
0
      dev->last.bidi_level,
1360
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1361
0
      flags);
1362
0
  }
1363
1364
  /* If we haven't spotted a postfix by this point, then don't force ourselves to output
1365
   * any more of the actualtext at this point. We might get a new text object that matches
1366
   * more of it. */
1367
0
  if (end == span->len)
1368
0
  {
1369
    /* Shorten actualtext and exit. */
1370
0
    z = strlen(actualtext)+1;
1371
0
    memmove(mt->text, actualtext, z);
1372
0
    return;
1373
0
  }
1374
1375
  /* if this is the first text on the page, and the actual text suffix matches the entire
1376
   * span text, then no font will have been set above, so set the last used font to the
1377
   * span font since flush_actualtext() assumes that a font has been set.
1378
   */
1379
0
  if (!dev->last.font)
1380
0
    dev->last.font = fz_keep_font(ctx, font);
1381
1382
  /* We found a matching postfix. It seems likely that this is going to be the only
1383
   * text object we get, so send any remaining actualtext now. */
1384
0
  flush_actualtext(ctx, dev, actualtext, i, i + (int)strlen(actualtext) - (span->len - end));
1385
1386
  /* Send the postfix */
1387
0
  if (end != span->len)
1388
0
    do_extract(ctx, dev, span, ctm, end, span->len, flags);
1389
1390
0
  mt->text[0] = 0;
1391
0
}
1392
1393
static void
1394
fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
1395
0
{
1396
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1397
0
  metatext_t *mt = NULL;
1398
1399
0
  if (span->len == 0)
1400
0
    return;
1401
1402
  /* Are we in an actualtext? */
1403
0
  if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT))
1404
0
    mt = find_actualtext(dev);
1405
1406
0
  if (mt)
1407
0
    do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags);
1408
0
  else
1409
0
    do_extract(ctx, dev, span, ctm, 0, span->len, flags);
1410
0
}
1411
1412
/*
	Convert a colour in an arbitrary colourspace, plus an alpha, into a
	packed 32 bit value laid out as 0xAARRGGBB. Each component is
	scaled to 0..255 with rounding and clamped.
*/
static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
{
	float rgb[3];
	uint32_t packed;
	int k;

	fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);

	/* Alpha ends up in the top byte; R, G and B are shifted in below it. */
	packed = (uint32_t) fz_clampi(alpha * 255 + 0.5f, 0, 255);
	for (k = 0; k < 3; k++)
		packed = (packed << 8) | ((uint32_t) fz_clampi(rgb[k] * 255 + 0.5f, 0, 255));

	return packed;
}
1422
1423
static void
1424
fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
1425
  fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1426
0
{
1427
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1428
0
  fz_text_span *span;
1429
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1430
0
    return;
1431
0
  tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1432
0
  for (span = text->head; span; span = span->next)
1433
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED);
1434
0
  fz_drop_text(ctx, tdev->lasttext);
1435
0
  tdev->lasttext = fz_keep_text(ctx, text);
1436
0
}
1437
1438
static void
1439
fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
1440
  fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1441
0
{
1442
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1443
0
  fz_text_span *span;
1444
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1445
0
    return;
1446
0
  tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1447
0
  for (span = text->head; span; span = span->next)
1448
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED);
1449
0
  fz_drop_text(ctx, tdev->lasttext);
1450
0
  tdev->lasttext = fz_keep_text(ctx, text);
1451
0
}
1452
1453
static void
1454
fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
1455
0
{
1456
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1457
0
  fz_text_span *span;
1458
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1459
0
    return;
1460
0
  tdev->color = 0;
1461
0
  for (span = text->head; span; span = span->next)
1462
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
1463
0
  fz_drop_text(ctx, tdev->lasttext);
1464
0
  tdev->lasttext = fz_keep_text(ctx, text);
1465
0
}
1466
1467
static void
1468
fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
1469
0
{
1470
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1471
0
  fz_text_span *span;
1472
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1473
0
    return;
1474
0
  tdev->color = 0;
1475
0
  for (span = text->head; span; span = span->next)
1476
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
1477
0
  fz_drop_text(ctx, tdev->lasttext);
1478
0
  tdev->lasttext = fz_keep_text(ctx, text);
1479
0
}
1480
1481
static void
1482
fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
1483
0
{
1484
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1485
0
  fz_text_span *span;
1486
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1487
0
    return;
1488
0
  tdev->color = 0;
1489
0
  for (span = text->head; span; span = span->next)
1490
0
    fz_stext_extract(ctx, tdev, span, ctm, 0);
1491
0
  fz_drop_text(ctx, tdev->lasttext);
1492
0
  tdev->lasttext = fz_keep_text(ctx, text);
1493
0
}
1494
1495
static void
1496
fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
1497
0
{
1498
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1499
0
  metatext_t *mt = find_actualtext(tdev);
1500
0
  char *new_text = NULL;
1501
1502
0
  if (mt != NULL && meta == FZ_METATEXT_ACTUALTEXT)
1503
0
    flush_actualtext(ctx, tdev, mt->text, 0, -1);
1504
1505
0
  if (meta == FZ_METATEXT_ACTUALTEXT)
1506
0
    tdev->last.valid = 0;
1507
1508
0
  new_text = text ? fz_strdup(ctx, text) : NULL;
1509
1510
0
  fz_try(ctx)
1511
0
  {
1512
0
    mt = fz_malloc_struct(ctx, metatext_t);
1513
1514
0
    mt->prev = tdev->metatext;
1515
0
    tdev->metatext = mt;
1516
0
    mt->type = meta;
1517
0
    mt->text = new_text;
1518
0
    mt->bounds = fz_empty_rect;
1519
0
  }
1520
0
  fz_catch(ctx)
1521
0
  {
1522
0
    fz_free(ctx, new_text);
1523
0
    fz_rethrow(ctx);
1524
0
  }
1525
0
}
1526
1527
static void
1528
pop_metatext(fz_context *ctx, fz_stext_device *dev)
1529
0
{
1530
0
  metatext_t *prev;
1531
0
  fz_rect bounds;
1532
1533
0
  if (!dev->metatext)
1534
0
    return;
1535
1536
0
  prev = dev->metatext->prev;
1537
0
  bounds = dev->metatext->bounds;
1538
0
  fz_free(ctx, dev->metatext->text);
1539
0
  fz_free(ctx, dev->metatext);
1540
0
  dev->metatext = prev;
1541
0
  if (prev)
1542
0
    prev->bounds = fz_union_rect(prev->bounds, bounds);
1543
0
}
1544
1545
static void
1546
fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
1547
0
{
1548
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1549
0
  fz_font *myfont = NULL;
1550
1551
0
  if (!tdev->metatext)
1552
0
    return; /* Mismatched pop. Live with it. */
1553
1554
0
  if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT || (tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0)
1555
0
  {
1556
    /* We only deal with ActualText here. Just pop anything else off,
1557
     * and we're done. */
1558
0
    pop_metatext(ctx, tdev);
1559
0
    return;
1560
0
  }
1561
1562
  /* If we have a 'last' text position, send the content after that. */
1563
0
  if (tdev->last.valid)
1564
0
  {
1565
0
    tdev->last.trm.e = tdev->pen.x;
1566
0
    tdev->last.trm.f = tdev->pen.y;
1567
1568
0
    flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1);
1569
0
    pop_metatext(ctx, tdev);
1570
0
    tdev->last.valid = 0;
1571
0
    return;
1572
0
  }
1573
1574
  /* Unless we have collected a rectangle for content that encloses the actual text,
1575
   * we can't do anything. */
1576
0
  if (fz_is_empty_rect(tdev->metatext->bounds))
1577
0
  {
1578
0
    if ((dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) == 0 && tdev->metatext->text[0])
1579
0
      fz_warn(ctx, "ActualText with no position. Text may be lost or mispositioned.");
1580
0
    pop_metatext(ctx, tdev);
1581
0
    return;
1582
0
  }
1583
1584
  /* We have a rectangle, so send the text to fill that. */
1585
0
  tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
1586
0
  tdev->last.trm.b = 0;
1587
0
  tdev->last.trm.c = 0;
1588
0
  tdev->last.trm.d = tdev->metatext->bounds.y0 - tdev->metatext->bounds.y1;
1589
0
  tdev->last.trm.e = tdev->metatext->bounds.x0;
1590
0
  tdev->last.trm.f = tdev->metatext->bounds.y1;
1591
0
  tdev->last.valid = 1;
1592
1593
0
  fz_var(myfont);
1594
1595
0
  fz_try(ctx)
1596
0
  {
1597
0
    if (tdev->last.font == NULL)
1598
0
    {
1599
0
      myfont = fz_new_base14_font(ctx, "Helvetica");
1600
0
      tdev->last.font = myfont;
1601
0
    }
1602
0
    flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1);
1603
0
    pop_metatext(ctx, tdev);
1604
0
  }
1605
0
  fz_always(ctx)
1606
0
  {
1607
0
    if (myfont)
1608
0
    {
1609
0
      tdev->last.font = NULL;
1610
0
      fz_drop_font(ctx, myfont);
1611
0
    }
1612
0
  }
1613
0
  fz_catch(ctx)
1614
0
    fz_rethrow(ctx);
1615
0
}
1616
1617
1618
/* Images and shadings */
1619
1620
static void
1621
fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
1622
0
{
1623
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1624
0
  fz_rect *bounds = actualtext_bounds(tdev);
1625
1626
  /* If there is an actualtext in force, update its bounds. */
1627
0
  if (bounds)
1628
0
  {
1629
0
    static const fz_rect unit = { 0, 0, 1, 1 };
1630
0
    *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm));
1631
0
  }
1632
1633
  /* Unless we are being told to preserve images, nothing to do here. */
1634
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1635
0
    return;
1636
1637
  /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
1638
0
  if (alpha >= 0.5f)
1639
0
  {
1640
0
    fz_stext_block *block;
1641
0
    flush_lazy_vectors(ctx, tdev->page, tdev);
1642
0
    block = add_image_block_to_page(ctx, tdev->page, ctm, img, tdev->id);
1643
0
    if (tdev->opts.flags & FZ_STEXT_CLIP)
1644
0
    {
1645
0
      fz_rect clip = fz_device_current_scissor(ctx, dev);
1646
0
      clip = fz_intersect_rect(clip, tdev->page->mediabox);
1647
0
      block->bbox = fz_intersect_rect(block->bbox, clip);
1648
0
    }
1649
0
  }
1650
0
}
1651
1652
static void
1653
fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
1654
    fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
1655
0
{
1656
0
  fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
1657
0
}
1658
1659
static fz_image *
1660
fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
1661
0
{
1662
0
  fz_matrix ctm = *in_out_ctm;
1663
0
  fz_pixmap *pix;
1664
0
  fz_image *img = NULL;
1665
0
  fz_rect bounds;
1666
0
  fz_irect bbox;
1667
1668
0
  bounds = fz_bound_shade(ctx, shade, ctm);
1669
0
  bounds = fz_intersect_rect(bounds, scissor);
1670
0
  bbox = fz_irect_from_rect(bounds);
1671
1672
0
  pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
1673
0
  fz_try(ctx)
1674
0
  {
1675
0
    if (shade->use_background)
1676
0
      fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
1677
0
    else
1678
0
      fz_clear_pixmap(ctx, pix);
1679
0
    fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
1680
0
    img = fz_new_image_from_pixmap(ctx, pix, NULL);
1681
0
  }
1682
0
  fz_always(ctx)
1683
0
    fz_drop_pixmap(ctx, pix);
1684
0
  fz_catch(ctx)
1685
0
    fz_rethrow(ctx);
1686
1687
0
  in_out_ctm->a = pix->w;
1688
0
  in_out_ctm->b = 0;
1689
0
  in_out_ctm->c = 0;
1690
0
  in_out_ctm->d = pix->h;
1691
0
  in_out_ctm->e = pix->x;
1692
0
  in_out_ctm->f = pix->y;
1693
0
  return img;
1694
0
}
1695
1696
static void
1697
fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
1698
0
{
1699
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1700
0
  fz_matrix local_ctm;
1701
0
  fz_rect scissor;
1702
0
  fz_image *image;
1703
1704
  /* If we aren't preserving images, don't waste time making the shade. */
1705
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1706
0
  {
1707
    /* But we do still need to handle actualtext bounds. */
1708
0
    fz_rect *bounds = actualtext_bounds(tdev);
1709
0
    if (bounds)
1710
0
      *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
1711
0
    return;
1712
0
  }
1713
1714
0
  local_ctm = ctm;
1715
0
  scissor = fz_device_current_scissor(ctx, dev);
1716
0
  if (dev->flags & FZ_STEXT_CLIP_RECT)
1717
0
    scissor = fz_intersect_rect(scissor, tdev->opts.clip);
1718
0
  scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
1719
0
  image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
1720
0
  fz_try(ctx)
1721
0
    fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
1722
0
  fz_always(ctx)
1723
0
    fz_drop_image(ctx, image);
1724
0
  fz_catch(ctx)
1725
0
    fz_rethrow(ctx);
1726
0
}
1727
1728
static void
1729
fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
1730
0
{
1731
0
  fz_stext_line *line;
1732
0
  fz_stext_char *ch;
1733
1734
0
  for ( ; block != NULL; block = block->next)
1735
0
  {
1736
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1737
0
    {
1738
0
      if (block->u.s.down)
1739
0
      {
1740
0
        fz_stext_block *block2;
1741
0
        fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
1742
0
        for (block2 = block->u.s.down->first_block; block2 != NULL; block2 = block2->next)
1743
0
        {
1744
0
          block->bbox = fz_union_rect(block->bbox, block2->bbox);
1745
0
        }
1746
0
      }
1747
0
    }
1748
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1749
0
      continue;
1750
0
    for (line = block->u.t.first_line; line; line = line->next)
1751
0
    {
1752
0
      int reorder = 0;
1753
0
      for (ch = line->first_char; ch; ch = ch->next)
1754
0
      {
1755
0
        fz_rect ch_box = fz_rect_from_quad(ch->quad);
1756
0
        if (ch == line->first_char)
1757
0
          line->bbox = ch_box;
1758
0
        else
1759
0
          line->bbox = fz_union_rect(line->bbox, ch_box);
1760
0
        if (ch->bidi == 3)
1761
0
          reorder = 1;
1762
0
      }
1763
0
      block->bbox = fz_union_rect(block->bbox, line->bbox);
1764
0
      if (reorder)
1765
0
        reverse_bidi_line(line);
1766
0
    }
1767
0
  }
1768
0
}
1769
1770
static void
1771
advance_to_x(fz_point *a, fz_point b, float x)
1772
0
{
1773
0
  a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x);
1774
0
  a->x = x;
1775
0
}
1776
1777
static void
1778
advance_to_y(fz_point *a, fz_point b, float y)
1779
0
{
1780
0
  a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y);
1781
0
  a->y = y;
1782
0
}
1783
1784
static int
1785
line_crosses_rect(fz_point a, fz_point b, fz_rect r)
1786
0
{
1787
  /* Cope with trivial exclusions */
1788
0
  if (a.x < r.x0 && b.x < r.x0)
1789
0
    return 0;
1790
0
  if (a.x > r.x1 && b.x > r.x1)
1791
0
    return 0;
1792
0
  if (a.y < r.y0 && b.y < r.y0)
1793
0
    return 0;
1794
0
  if (a.y > r.y1 && b.y > r.y1)
1795
0
    return 0;
1796
1797
0
  if (a.x < r.x0)
1798
0
    advance_to_x(&a, b, r.x0);
1799
0
  if (a.x > r.x1)
1800
0
    advance_to_x(&a, b, r.x1);
1801
0
  if (a.y < r.y0)
1802
0
    advance_to_y(&a, b, r.y0);
1803
0
  if (a.y > r.y1)
1804
0
    advance_to_y(&a, b, r.y1);
1805
1806
0
  return fz_is_point_inside_rect(a, r);
1807
0
}
1808
1809
static float
1810
calculate_ascent(fz_point p, fz_point origin, fz_point dir)
1811
0
{
1812
0
  return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
1813
0
}
1814
1815
/* Create us a rect from the given quad, but extend it downwards
1816
 * to allow for underlines that pass under the glyphs. */
1817
static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
1818
0
{
1819
  /* Consider the two rects from A and g respectively.
1820
   *
1821
   * ul +------+ ur   or
1822
   *    |  /\  |         ul +------+ ur
1823
   *    | /__\ |            | /''\ |
1824
   *    |/    \|            |(    ||
1825
   * ll +------+ lr         | ''''||
1826
   *                        |  ''' | <-expected underline level
1827
   *                     ll +------+ lr
1828
   *
1829
   * So an underline won't cross A's rect, but will cross g's.
1830
   * We want to make a rect that includes a suitable amount of
1831
   * space underneath. The information we have available to us
1832
   * is summed up here:
1833
   *
1834
   *  ul +---------+ ur
1835
   *     |         |
1836
   *     | origin  |
1837
   *     |+----------> dir
1838
   *     |         |
1839
   *  ll +---------+ lr
1840
   *
1841
   * Consider the distance from ul to the line that passes through
1842
   * the origin with direction dir. Similarly, consider the distance
1843
   * from ur to the same line. This can be thought of as the 'ascent'
1844
   * of this character.
1845
   *
1846
   * We'd like the distance from ul to ll to be greater than this, so
1847
   * as to ensure we cover the possible location where an underline
1848
   * might reasonably go.
1849
   *
1850
   * If we have a line (l) through point A with direction vector u,
1851
   * the distance between point P and line(l) is:
1852
   *
1853
   * d(P,l) = || AP x u || / || u ||
1854
   *
1855
   * where x is the cross product.
1856
   *
1857
   * For us, because || dir || = 1:
1858
   *
1859
   * d(ul, origin) = || (origin-ul) x dir ||
1860
   *
1861
   * The cross product is only defined in 3 (or 7!) dimensions, so
1862
   * extend both vectors into 3d by defining a 0 z component.
1863
   *
1864
   * (origin-ul) x dir = [ (origin.y - ul.y) . 0     - 0                 . dir.y ]
1865
   *                     [ 0                 . dir.x - (origin.x - ul.y) . 0     ]
1866
   *                     [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
1867
   *
1868
   * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
1869
   */
1870
0
  float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
1871
0
  fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
1872
0
  fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
1873
0
  float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
1874
0
  int neg = 0;
1875
0
  float extra_rise = 0;
1876
1877
  /* Spaces will have 0 ascent. underscores will have small ascent.
1878
   * We want a sane ascent to be able to spot strikeouts, but not
1879
   * so big that it incorporates lines above the text, like borders. */
1880
0
  if (ascent < 0.75*size)
1881
0
    extra_rise = 0.75*size - ascent;
1882
1883
  /* We'd like height to be at least ascent + 1/4 size */
1884
0
  if (height < 0)
1885
0
    neg = 1, height = -height;
1886
0
  if (height < ascent + size * 0.25f)
1887
0
    height = ascent + size * 0.25f;
1888
1889
0
  height -= ascent;
1890
0
  if (neg)
1891
0
    height = -height;
1892
0
  quad.ll.x += - height * dir.y;
1893
0
  quad.ll.y +=   height * dir.x;
1894
0
  quad.lr.x += - height * dir.y;
1895
0
  quad.lr.y +=   height * dir.x;
1896
0
  quad.ul.x -= - extra_rise * dir.y;
1897
0
  quad.ul.y -=   extra_rise * dir.x;
1898
0
  quad.ur.x -= - extra_rise * dir.y;
1899
0
  quad.ur.y -=   extra_rise * dir.x;
1900
1901
0
  return fz_rect_from_quad(quad);
1902
0
}
1903
1904
/* Approximate float comparison.
 * Returns non-zero when a and b differ by less than EPSILON (an absolute
 * tolerance), zero otherwise. A NaN operand never compares equal. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
	float delta = a - b;

	return (delta < 0 ? -delta : delta) < EPSILON;
}
1912
1913
static void
1914
check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness)
1915
0
{
1916
0
  for ( ; block; block = block->next)
1917
0
  {
1918
0
    fz_stext_line *line;
1919
1920
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1921
0
      continue;
1922
1923
0
    for (line = block->u.t.first_line; line != NULL; line = line->next)
1924
0
    {
1925
0
      fz_stext_char *ch;
1926
1927
0
      if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) &&
1928
0
        (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y)))
1929
0
        continue;
1930
1931
      /* Matching directions... */
1932
1933
      /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
1934
       * chars. - FIXME: Now we do! */
1935
0
      for (ch = line->first_char; ch; ch = ch->next)
1936
0
      {
1937
0
        fz_point up;
1938
0
        float dx, dy, dot;
1939
0
        fz_rect ch_box;
1940
1941
0
        ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
1942
1943
0
        if (!line_crosses_rect(from, to, ch_box))
1944
0
          continue;
1945
1946
        /* If the thickness is more than a 1/4 of the size, it's a highlight, not a
1947
         * line! */
1948
0
        if (ch->size < thickness*4)
1949
0
        {
1950
          /* Distinguish from a background fill */
1951
0
          if (thickness <= ch->size*1.5f)
1952
0
            ch->flags |= FZ_STEXT_HIGHLIGHT;
1953
0
          continue;
1954
0
        }
1955
1956
        /* Is this a strikeout or an underline? */
1957
1958
        /* The baseline moves from ch->origin in the direction line->dir */
1959
0
        up.x = line->dir.y;
1960
0
        up.y = -line->dir.x;
1961
1962
        /* How far is our line displaced from the line through the origin? */
1963
0
        dx = from.x - ch->origin.x;
1964
0
        dy = from.y - ch->origin.y;
1965
        /* Dot product with up. up is normalised */
1966
0
        dot = dx * up.x + dy * up.y;
1967
1968
0
        if (dot > 0 && dot <= 0.8f * ch->font->ascender * ch->size)
1969
0
          ch->flags |= FZ_STEXT_STRIKEOUT;
1970
0
        else
1971
0
          ch->flags |= FZ_STEXT_UNDERLINE;
1972
0
      }
1973
0
    }
1974
0
  }
1975
0
}
1976
1977
static void
1978
check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page)
1979
0
{
1980
0
  int i, n = tdev->rect_len;
1981
1982
0
  for (i = 0; i < n; i++)
1983
0
  {
1984
0
    fz_point from = tdev->rects[i].from;
1985
0
    fz_point to = tdev->rects[i].to;
1986
0
    float thickness = tdev->rects[i].thickness;
1987
0
    fz_point dir;
1988
0
    dir.x = to.x - from.x;
1989
0
    dir.y = to.y - from.y;
1990
0
    dir = fz_normalize_vector(dir);
1991
1992
0
    check_strikeout(ctx, page->first_block, from, to, dir, thickness);
1993
0
  }
1994
0
}
1995
1996
static void
1997
fz_stext_close_device(fz_context *ctx, fz_device *dev)
1998
0
{
1999
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2000
0
  fz_stext_page *page = tdev->page;
2001
2002
0
  if ((tdev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(tdev->lastchar) && tdev->lastline != NULL)
2003
0
    tdev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED;
2004
2005
0
  flush_lazy_vectors(ctx, page, tdev);
2006
2007
0
  fixup_bboxes_and_bidi(ctx, page->first_block);
2008
2009
0
  if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES)
2010
0
    check_rects_for_strikeout(ctx, tdev, page);
2011
2012
  /* TODO: smart sorting of blocks and lines in reading order */
2013
  /* TODO: unicode NFC normalization */
2014
2015
0
  if (tdev->opts.flags & FZ_STEXT_SEGMENT)
2016
0
    fz_segment_stext_page(ctx, page);
2017
2018
0
  if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK)
2019
0
    fz_paragraph_break(ctx, page);
2020
2021
0
  if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT)
2022
0
    fz_table_hunt(ctx, page);
2023
0
}
2024
2025
static void
2026
fz_stext_drop_device(fz_context *ctx, fz_device *dev)
2027
0
{
2028
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2029
0
  fz_drop_text(ctx, tdev->lasttext);
2030
0
  fz_drop_font(ctx, tdev->last.font);
2031
0
  while (tdev->metatext)
2032
0
    pop_metatext(ctx, tdev);
2033
2034
0
  fz_free(ctx, tdev->rects);
2035
0
}
2036
2037
static int
2038
val_is_rect(const char *val, fz_rect *rp)
2039
0
{
2040
0
  fz_rect r;
2041
0
  const char *s;
2042
2043
0
  s = strchr(val, ':');
2044
0
  if (s == NULL || s == val)
2045
0
    return 0;
2046
0
  r.x0 = fz_atof(val);
2047
0
  val = s+1;
2048
0
  s = strchr(val, ':');
2049
0
  if (s == NULL || s == val)
2050
0
    return 0;
2051
0
  r.y0 = fz_atof(val);
2052
0
  val = s+1;
2053
0
  s = strchr(val, ':');
2054
0
  if (s == NULL || s == val)
2055
0
    return 0;
2056
0
  r.x1 = fz_atof(val);
2057
0
  val = s+1;
2058
0
  r.y1 = fz_atof(val);
2059
2060
0
  *rp = r;
2061
2062
0
  return 1;
2063
0
}
2064
2065
/* Reset *opts to the defaults: everything zeroed, except that FZ_STEXT_CLIP
 * is set and the scale is 1. */
void fz_init_stext_options(fz_context *ctx, fz_stext_options *opts)
{
	memset(opts, 0, sizeof *opts);

	opts->scale = 1;
	opts->flags |= FZ_STEXT_CLIP;
}
2072
2073
fz_stext_options *
2074
fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
2075
0
{
2076
0
  fz_options *options = fz_new_options(ctx, string);
2077
0
  fz_try(ctx)
2078
0
  {
2079
0
    fz_init_stext_options(ctx, opts);
2080
0
    fz_apply_stext_options(ctx, opts, options);
2081
0
    fz_throw_on_unused_options(ctx, options, "stext");
2082
0
  }
2083
0
  fz_always(ctx)
2084
0
    fz_drop_options(ctx, options);
2085
0
  fz_catch(ctx)
2086
0
    fz_rethrow(ctx);
2087
0
  return opts;
2088
0
}
2089
2090
/* Set (B non-zero) or clear (B zero) the bits C within the lvalue A.
 * The whole expansion is parenthesised so that it also behaves when used
 * inside a larger expression. A is evaluated more than once, so it must be
 * free of side effects. */
#define SETCLEARBOOL(A, B, C) \
	((A) = (B) ? ((A) | (C)) : ((A) & ~(C)))
2092
2093
void
2094
fz_apply_stext_options(fz_context *ctx, fz_stext_options *opts, fz_options *string)
2095
0
{
2096
0
  const char *val;
2097
0
  float x;
2098
0
  int b;
2099
2100
  /* when adding options, remember to update fz_stext_options_usage above */
2101
2102
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-ligatures", &b))
2103
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_LIGATURES);
2104
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-whitespace", &b))
2105
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_WHITESPACE);
2106
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-images", &b))
2107
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_IMAGES);
2108
0
  if (fz_lookup_option_boolean(ctx, string, "inhibit-spaces", &b))
2109
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_INHIBIT_SPACES);
2110
0
  if (fz_lookup_option_boolean(ctx, string, "dehyphenate", &b))
2111
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_DEHYPHENATE);
2112
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-spans", &b))
2113
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_SPANS);
2114
0
  if (fz_lookup_option_boolean(ctx, string, "structured", &b))
2115
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STRUCTURE);
2116
0
  if (fz_lookup_option_boolean(ctx, string, "use-cid-for-unknown-unicode", &b))
2117
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE);
2118
0
  if (fz_lookup_option_boolean(ctx, string, "use-gid-for-unknown-unicode", &b))
2119
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE);
2120
0
  if (fz_lookup_option_boolean(ctx, string, "accurate-bboxes", &b))
2121
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_BBOXES);
2122
0
  if (fz_lookup_option_boolean(ctx, string, "vectors", &b))
2123
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_VECTORS);
2124
0
  if (fz_lookup_option_boolean(ctx, string, "lazy-vectors", &b))
2125
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_LAZY_VECTORS);
2126
0
  if (fz_lookup_option_boolean(ctx, string, "fuzzy-vectors", &b))
2127
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_FUZZY_VECTORS);
2128
0
  if (fz_lookup_option_boolean(ctx, string, "ignore-actualtext", &b))
2129
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_IGNORE_ACTUALTEXT);
2130
0
  if (fz_lookup_option_boolean(ctx, string, "segment", &b))
2131
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_SEGMENT);
2132
0
  if (fz_lookup_option_boolean(ctx, string, "paragraph-break", &b))
2133
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PARAGRAPH_BREAK);
2134
0
  if (fz_lookup_option_boolean(ctx, string, "table-hunt", &b))
2135
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_TABLE_HUNT);
2136
0
  if (fz_lookup_option_boolean(ctx, string, "collect-styles", &b))
2137
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STYLES);
2138
0
  if (fz_lookup_option_boolean(ctx, string, "accurate-ascenders", &b))
2139
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_ASCENDERS);
2140
0
  if (fz_lookup_option_boolean(ctx, string, "accurate-side-bearings", &b))
2141
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_SIDE_BEARINGS);
2142
2143
0
  if (fz_lookup_option_boolean(ctx, string, "mediabox-clip", &b))
2144
0
  {
2145
0
    fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
2146
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP);
2147
0
  }
2148
0
  if (fz_lookup_option_boolean(ctx, string, "clip", &b))
2149
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP);
2150
2151
0
  if (fz_lookup_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
2152
0
    opts->flags |= FZ_STEXT_CLIP_RECT;
2153
2154
0
  if (fz_lookup_option_float(ctx, string, "resolution", &x))
2155
0
    opts->scale = x / 96.0f; /* HTML base resolution is 96ppi */
2156
2157
0
  fz_validate_options(ctx, string, "stext");
2158
0
}
2159
2160
typedef struct
2161
{
2162
  int fail;
2163
  int count;
2164
  fz_point corners[4];
2165
} is_rect_data;
2166
2167
static void
2168
stash_point(is_rect_data *rd, float x, float y)
2169
0
{
2170
0
  if (rd->count > 3)
2171
0
  {
2172
0
    rd->fail = 1;
2173
0
    return;
2174
0
  }
2175
2176
0
  rd->corners[rd->count].x = x;
2177
0
  rd->corners[rd->count].y = y;
2178
0
  rd->count++;
2179
0
}
2180
2181
static void
2182
is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
2183
0
{
2184
0
  is_rect_data *rd = arg;
2185
0
  if (rd->fail)
2186
0
    return;
2187
2188
0
  if (rd->count != 0)
2189
0
  {
2190
0
    rd->fail = 1;
2191
0
    return;
2192
0
  }
2193
0
  stash_point(rd, x, y);
2194
0
}
2195
2196
static void
2197
is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
2198
0
{
2199
0
  is_rect_data *rd = arg;
2200
0
  if (rd->fail)
2201
0
    return;
2202
2203
0
  if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
2204
0
    return;
2205
2206
0
  stash_point(rd, x, y);
2207
0
}
2208
2209
static void
2210
is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
2211
0
{
2212
0
  is_rect_data *rd = arg;
2213
0
  rd->fail = 1;
2214
0
}
2215
2216
static void
2217
is_rect_closepath(fz_context *ctx, void *arg)
2218
0
{
2219
0
  is_rect_data *rd = arg;
2220
0
  if (rd->fail)
2221
0
    return;
2222
0
  if (rd->count == 3)
2223
0
    stash_point(rd, rd->corners[0].x, rd->corners[0].y);
2224
0
  if (rd->count != 4)
2225
0
    rd->fail = 1;
2226
0
}
2227
2228
static int
2229
is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm, fz_rect *r)
2230
0
{
2231
0
  float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
2232
0
  is_rect_data rd = { 0 };
2233
0
  static const fz_path_walker walker =
2234
0
  {
2235
0
    is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
2236
0
  };
2237
0
  int i;
2238
2239
0
  fz_walk_path(ctx, path, &walker, &rd);
2240
2241
0
  if (rd.fail)
2242
0
    return 0;
2243
2244
0
  if (rd.count == 2)
2245
0
  {
2246
0
    stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
2247
0
    stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
2248
0
  }
2249
2250
0
  for (i = 0 ; i < 4; i++)
2251
0
  {
2252
0
    fz_point p = fz_transform_point(rd.corners[i], ctm);
2253
2254
0
    rd.corners[i].x = p.x;
2255
0
    rd.corners[i].y = p.y;
2256
0
  }
2257
2258
  /* So we have a 4 cornered path. Hopefully something like:
2259
   * 0---------1
2260
   * |         |
2261
   * 3---------2
2262
   * but it might be:
2263
   * 0---------3
2264
   * |         |
2265
   * 1---------2
2266
  */
2267
0
  while (1)
2268
0
  {
2269
0
    d01x = rd.corners[1].x - rd.corners[0].x;
2270
0
    d01y = rd.corners[1].y - rd.corners[0].y;
2271
0
    d01 = d01x * d01x + d01y * d01y;
2272
0
    d03x = rd.corners[3].x - rd.corners[0].x;
2273
0
    d03y = rd.corners[3].y - rd.corners[0].y;
2274
0
    d03 = d03x * d03x + d03y * d03y;
2275
0
    if(d01 < d03)
2276
0
    {
2277
      /* We are the latter case. Transpose it. */
2278
0
      fz_point p = rd.corners[1];
2279
0
      rd.corners[1] = rd.corners[3];
2280
0
      rd.corners[3] = p;
2281
0
    }
2282
0
    else
2283
0
      break;
2284
0
  }
2285
0
  d32x = rd.corners[2].x - rd.corners[3].x;
2286
0
  d32y = rd.corners[2].y - rd.corners[3].y;
2287
2288
  /* So d32x and d01x need to be the same for this to be a strikeout. */
2289
0
  if (!feq(d32x, d01x) || !feq(d32y, d01y))
2290
0
    return 0;
2291
2292
  /* We are plausibly a rectangle. */
2293
0
  *thickness = sqrtf(d03x * d03x + d03y * d03y);
2294
2295
0
  from->x = (rd.corners[0].x + rd.corners[3].x)/2;
2296
0
  from->y = (rd.corners[0].y + rd.corners[3].y)/2;
2297
0
  to->x = (rd.corners[1].x + rd.corners[2].x)/2;
2298
0
  to->y = (rd.corners[1].y + rd.corners[2].y)/2;
2299
2300
0
  *r = fz_empty_rect;
2301
0
  if ((rd.corners[0].x == rd.corners[3].x && rd.corners[1].x == rd.corners[2].x &&
2302
0
    rd.corners[0].y == rd.corners[1].y && rd.corners[2].y == rd.corners[3].y) ||
2303
0
    (rd.corners[0].x == rd.corners[1].x && rd.corners[3].x == rd.corners[2].x &&
2304
0
    rd.corners[0].y == rd.corners[3].y && rd.corners[2].y == rd.corners[1].y))
2305
0
  {
2306
0
    *r = fz_include_point_in_rect(*r, rd.corners[0]);
2307
0
    *r = fz_include_point_in_rect(*r, rd.corners[2]);
2308
0
  }
2309
2310
0
  return 1;
2311
0
}
2312
2313
static void
2314
check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm, int argb)
2315
0
{
2316
0
  float thickness;
2317
0
  fz_point from, to;
2318
0
  int i, n = tdev->rect_len;
2319
0
  fz_rect r;
2320
2321
  /* Is this path a thin rectangle (possibly rotated)? If so, then we need to
2322
   * consider it as being a strikeout or underline. */
2323
0
  if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm, &r))
2324
0
    return;
2325
2326
  /* If we've already had a rectangle of the same colour that covers this region
2327
   * then that was probably a cell background color, and this is probably a
2328
   * text string background fill. This is not a highlight, or underline or
2329
   * strikeout, so don't keep it. */
2330
0
  for (i = 0; i < n; i++)
2331
0
  {
2332
0
    rect_details *rct = &tdev->rects[i];
2333
0
    if (rct->argb == argb && fz_contains_rect(rct->rect, r))
2334
0
      return;
2335
0
  }
2336
2337
  /* Add to the list of rects in the device. */
2338
0
  if (tdev->rect_len == tdev->rect_max)
2339
0
  {
2340
0
    int newmax = tdev->rect_max * 2;
2341
0
    if (newmax == 0)
2342
0
      newmax = 32;
2343
2344
0
    tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax);
2345
0
    tdev->rect_max = newmax;
2346
0
  }
2347
0
  tdev->rects[tdev->rect_len].from = from;
2348
0
  tdev->rects[tdev->rect_len].to = to;
2349
0
  tdev->rects[tdev->rect_len].thickness = thickness;
2350
0
  tdev->rects[tdev->rect_len].rect = r;
2351
0
  tdev->rects[tdev->rect_len].argb = argb;
2352
0
  tdev->rect_len++;
2353
0
}
2354
2355
static void
2356
add_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, fz_rect bbox, uint32_t flags, uint32_t argb, int id, float exp)
2357
0
{
2358
0
  fz_stext_block *b;
2359
2360
0
  if (exp != 0)
2361
0
  {
2362
0
    bbox.x0 -= exp;
2363
0
    bbox.y0 -= exp;
2364
0
    bbox.x1 += exp;
2365
0
    bbox.y1 += exp;
2366
0
  }
2367
2368
0
  if (tdev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP))
2369
0
  {
2370
0
    fz_rect r = current_clip(ctx, tdev);
2371
0
    bbox = fz_intersect_rect(bbox, r);
2372
0
    if (!fz_is_valid_rect(bbox))
2373
0
      return;
2374
0
  }
2375
2376
  /* Can we just add this one onto the previous one? */
2377
  /* Only if it's a small rectangle... */
2378
0
  if ((flags & FZ_STEXT_VECTOR_IS_RECTANGLE) && bbox.x1 - bbox.x0 <= 2 && bbox.y1 - bbox.y0 <= 2)
2379
0
  {
2380
0
    fz_stext_block *prev;
2381
    /* Find b = the previous block. */
2382
0
    if (tdev->flags & FZ_STEXT_LAZY_VECTORS)
2383
0
      b = tdev->lazy_vectors_tail;
2384
0
    else if (page->last_struct)
2385
0
      b = page->last_struct->last_block;
2386
0
    else
2387
0
      b = page->last_block;
2388
2389
0
    if (b && b->type == FZ_STEXT_BLOCK_VECTOR && b->u.v.argb == argb && b->u.v.flags == flags)
2390
0
    {
2391
      /* Maybe we can join it? */
2392
0
      float fudge = 0.001f;
2393
0
      if (b->bbox.x0 == bbox.x0 && b->bbox.x1 == bbox.x1 && b->bbox.y1 + fudge >= bbox.y0 && b->bbox.y0 - fudge <= bbox.y1)
2394
0
      {
2395
        /* Stacks vertically. */
2396
0
        b->bbox.y0 = fz_min(b->bbox.y0, bbox.y0);
2397
0
        b->bbox.y1 = fz_max(b->bbox.y1, bbox.y1);
2398
0
        return;
2399
0
      }
2400
0
      else if (b->bbox.y0 == bbox.y0 && b->bbox.y1 == bbox.y1 && b->bbox.x1 + fudge >= bbox.x0 && b->bbox.x0 - fudge <= bbox.x1)
2401
0
      {
2402
        /* Stacks horizontally. */
2403
0
        b->bbox.x0 = fz_min(b->bbox.x0, bbox.x0);
2404
0
        b->bbox.x1 = fz_max(b->bbox.x1, bbox.x1);
2405
0
        return;
2406
0
      }
2407
2408
      /* So, we can't add our new vector onto the previous one. But can we merge the 2 previous ones? */
2409
      /* The intent here is that we allow a set of vector 'blocks' to be merged together, perhaps:
2410
       *    ABC
2411
       * Then we allow another set to be merged together, perhaps DE:
2412
       *    ABC
2413
       *    DE
2414
       * Then when we get another block that can't be merged into DE (perhaps F):
2415
       *    ABC
2416
       *    DE
2417
       *    F
2418
       * We'll consider ABC and DE for merging. Whatevever block that F ends up
2419
       * in later (maybe FGH):
2420
       *    ABC
2421
       *    DE
2422
       *    FGH
2423
       * will be considered for merging later. We can always do this "exactly" (if the blocks
2424
       * line up precisely), but to do this 'lossily', we guard it with 'FUZZY_VECTORS'.
2425
       */
2426
0
      prev = b->prev;
2427
0
      while (prev && prev->type == FZ_STEXT_BLOCK_VECTOR && (prev->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE))
2428
0
      {
2429
        /* Lossless merging. */
2430
0
        if (prev->bbox.x0 == b->bbox.x0 && prev->bbox.x1 == b->bbox.x1 && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1)
2431
0
        {
2432
          /* Stacks exactly vertically. Very rarely hit. */
2433
0
          prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0);
2434
0
          prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1);
2435
0
          return;
2436
0
        }
2437
0
        else if (prev->bbox.y0 == b->bbox.y0 && prev->bbox.y1 == b->bbox.y1 && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1)
2438
0
        {
2439
          /* Stacks horizontally.  Very rarely hit. */
2440
0
          prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0);
2441
0
          prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1);
2442
0
          return;
2443
0
        }
2444
0
        if (tdev->flags & FZ_STEXT_FUZZY_VECTORS)
2445
0
        {
2446
          /* Be more forgiving in how we merge vectors */
2447
          /* We need to be careful not to merge together differently oriented borders for table cells.
2448
           *        C
2449
           *        |
2450
           *        v
2451
           *     +-----+-----+
2452
           * A-> |     |     |
2453
           *     +-----+-----+
2454
           * B-> |     |     |
2455
           *     +-----+-----+
2456
           *
2457
           * It'd be fine to merge borders A and B together, because it still signifies the same
2458
           * edges. It would NOT be fine to merge A and C together, because we'd lose the sense
2459
           * of them being borders, and just have a blob that covered the cell.
2460
           * The fudge2 logic below should hopefully allow for this, as well as allowing us to
2461
           * match blocks like:
2462
           *    ABC
2463
           *   DE FG
2464
           *    HIJ
2465
           *   KL MN
2466
           *    OPQ
2467
           */
2468
0
          float fudge2 = 2;
2469
0
          if ((fabsf(prev->bbox.x0 - b->bbox.x0) <= fudge2 || fabsf(prev->bbox.x1 - b->bbox.x1) <= fudge2) && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1)
2470
0
          {
2471
            /* Stacks vertically. */
2472
0
            goto join;
2473
0
          }
2474
0
          else if ((fabsf(prev->bbox.y0 - b->bbox.y0) <= fudge2 || fabsf(prev->bbox.y1 - b->bbox.y1) <= fudge2) && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1)
2475
0
          {
2476
            /* Stacks horizontally. */
2477
0
  join:
2478
0
            prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0);
2479
0
            prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1);
2480
0
            prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0);
2481
0
            prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1);
2482
            /* Unlink b (so, fiddle with b->prev, which is not necessarily prev!) */
2483
0
            b->prev->next = NULL;
2484
0
            if (tdev->flags & FZ_STEXT_LAZY_VECTORS)
2485
0
              tdev->lazy_vectors_tail = b->prev;
2486
0
            else if (page->last_struct)
2487
0
              page->last_struct->last_block = b->prev;
2488
0
            else
2489
0
              page->last_block = b->prev;
2490
0
            break;
2491
0
          }
2492
0
        }
2493
        /* Now, allow for looking further back. */
2494
0
        prev = prev->prev;
2495
0
      }
2496
0
    }
2497
0
  }
2498
2499
0
  if (tdev->flags & FZ_STEXT_LAZY_VECTORS)
2500
0
    b = add_lazy_vector(ctx, page, tdev, id);
2501
0
  else
2502
0
    b = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_VECTOR, id);
2503
2504
0
  b->bbox = bbox;
2505
0
  b->u.v.flags = flags;
2506
0
  b->u.v.argb = argb;
2507
0
}
2508
2509
typedef struct
2510
{
2511
  fz_stext_device *dev;
2512
  fz_matrix ctm;
2513
  uint32_t argb;
2514
  uint32_t flags;
2515
  fz_stext_page *page;
2516
  fz_rect seg_bounds;
2517
  fz_rect leftovers;
2518
  fz_rect pending;
2519
  int count;
2520
  fz_point p[5];
2521
  int id;
2522
  float exp;
2523
} split_path_data;
2524
2525
static void
2526
maybe_rect(fz_context *ctx, split_path_data *sp)
2527
0
{
2528
0
  int rect = 0;
2529
0
  int i;
2530
0
  fz_rect leftovers;
2531
2532
0
  if (sp->count >= 3)
2533
0
  {
2534
    /* Allow for multiple monotonic points in a horizontal or vertical line,
2535
     * such as seen in borders of tables where each column or row is written
2536
     * individually. (e.g. move 0 0 line 100 0 line 200 0 line 300 0) */
2537
0
    if (feq(sp->p[sp->count-1].x, sp->p[sp->count-2].x) &&
2538
0
      feq(sp->p[sp->count-1].x, sp->p[sp->count-3].x) &&
2539
0
      ((sp->p[sp->count-1].y <= sp->p[sp->count-2].y && sp->p[sp->count-2].y <= sp->p[sp->count-3].y) ||
2540
0
      (sp->p[sp->count-1].y >= sp->p[sp->count-2].y && sp->p[sp->count-2].y >= sp->p[sp->count-3].y)))
2541
0
    {
2542
      /*  y---->y---->y - Remove the central y */
2543
0
      sp->p[sp->count-2].y = sp->p[sp->count-1].y;
2544
0
      sp->count--;
2545
0
    }
2546
0
    else if (feq(sp->p[sp->count-1].y, sp->p[sp->count-2].y) &&
2547
0
      feq(sp->p[sp->count-1].y, sp->p[sp->count-3].y) &&
2548
0
      ((sp->p[sp->count-1].x <= sp->p[sp->count-2].x && sp->p[sp->count-2].x <= sp->p[sp->count-3].x) ||
2549
0
      (sp->p[sp->count-1].x >= sp->p[sp->count-2].x && sp->p[sp->count-2].x >= sp->p[sp->count-3].x)))
2550
0
    {
2551
      /*  x---->x---->x - Remove the central x */
2552
0
      sp->p[sp->count-2].x = sp->p[sp->count-1].x;
2553
0
      sp->count--;
2554
0
    }
2555
0
  }
2556
2557
0
  if (sp->count >= 0)
2558
0
  {
2559
0
    if (sp->count == 3)
2560
0
    {
2561
      /* Allow for "moveto A, lineto B, lineto A, close" */
2562
0
      if (feq(sp->p[0].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[2].y))
2563
0
        sp->count = 2;
2564
0
    }
2565
0
    if (sp->count == 2)
2566
0
    {
2567
0
      if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y))
2568
0
        rect = 1; /* Count that as a rect */
2569
0
    }
2570
0
    else if (sp->count == 4 || sp->count == 5)
2571
0
    {
2572
0
      if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y))
2573
0
        rect = 1;
2574
0
      else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y))
2575
0
        rect = 1;
2576
0
    }
2577
0
    if (rect)
2578
0
    {
2579
0
      fz_rect bounds;
2580
2581
0
      bounds.x0 = bounds.x1 = sp->p[0].x;
2582
0
      bounds.y0 = bounds.y1 = sp->p[0].y;
2583
0
      for (i = 1; i < sp->count; i++)
2584
0
        bounds = fz_include_point_in_rect(bounds, sp->p[i]);
2585
0
      if (fz_is_valid_rect(sp->pending))
2586
0
        add_vector(ctx, sp->page, sp->dev, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb, sp->id, sp->exp);
2587
0
      sp->pending = bounds;
2588
0
      return;
2589
0
    }
2590
0
  }
2591
2592
  /* We aren't a rectangle! */
2593
0
  leftovers = sp->seg_bounds;
2594
2595
0
  if (sp->dev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP))
2596
0
    leftovers = fz_intersect_rect(leftovers, current_clip(ctx, sp->dev));
2597
2598
0
  if (fz_is_valid_rect(leftovers))
2599
0
    sp->leftovers = fz_union_rect(sp->leftovers, leftovers);
2600
2601
  /* Remember we're not a rect. */
2602
0
  sp->count = -1;
2603
0
}
2604
2605
static void
2606
split_move(fz_context *ctx, void *arg, float x, float y)
2607
0
{
2608
0
  split_path_data *sp = (split_path_data *)arg;
2609
0
  fz_point p = fz_transform_point_xy(x, y, sp->ctm);
2610
2611
0
  maybe_rect(ctx, sp);
2612
0
  sp->p[0] = p;
2613
0
  sp->count = 1;
2614
0
  sp->seg_bounds.x0 = sp->seg_bounds.x1 = p.x;
2615
0
  sp->seg_bounds.y0 = sp->seg_bounds.y1 = p.y;
2616
0
}
2617
2618
static void
2619
split_line(fz_context *ctx, void *arg, float x, float y)
2620
0
{
2621
0
  split_path_data *sp = (split_path_data *)arg;
2622
0
  fz_point p = fz_transform_point_xy(x, y, sp->ctm);
2623
2624
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, p);
2625
2626
0
  if (sp->count >= 0)
2627
0
  {
2628
    /* Check for lines to the same point. */
2629
0
    if (sp->count > 0 && feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y))
2630
0
      return;
2631
    /* If we're still maybe a rect, just record the point. */
2632
0
    if (sp->count < 4)
2633
0
    {
2634
0
      sp->p[sp->count++] = p;
2635
0
      return;
2636
0
    }
2637
    /* Check for close line? */
2638
0
    if (sp->count == 4)
2639
0
    {
2640
0
      if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y))
2641
0
      {
2642
        /* We've just drawn a line back to the start point. */
2643
        /* Needless saving of point, but it makes the logic
2644
         * easier elsewhere. */
2645
0
        sp->p[sp->count++] = p;
2646
0
        return;
2647
0
      }
2648
0
    }
2649
    /* We can no longer be a rect. */
2650
0
    sp->count = -1;
2651
0
  }
2652
0
}
2653
2654
static void
2655
split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
2656
0
{
2657
0
  split_path_data *sp = (split_path_data *)arg;
2658
2659
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x1, y1, sp->ctm));
2660
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x2, y2, sp->ctm));
2661
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x3, y3, sp->ctm));
2662
2663
  /* We can no longer be a rect. */
2664
0
  sp->count = -1;
2665
0
}
2666
2667
static void
2668
split_close(fz_context *ctx, void *arg)
2669
0
{
2670
0
  split_path_data *sp = (split_path_data *)arg;
2671
2672
0
  maybe_rect(ctx, sp);
2673
0
  sp->count = 0;
2674
0
}
2675
2676
2677
static const
2678
fz_path_walker split_path_rects =
2679
{
2680
  split_move,
2681
  split_line,
2682
  split_curve,
2683
  split_close
2684
};
2685
2686
static void
2687
add_vectors_from_path(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, const fz_stroke_state *stroke, float exp)
2688
0
{
2689
0
  int have_leftovers;
2690
0
  split_path_data sp;
2691
0
  int id = tdev->id;
2692
0
  int trailing_moves_acceptable = (stroke == NULL || stroke->end_cap != FZ_LINECAP_ROUND);
2693
2694
0
  sp.dev = tdev;
2695
0
  sp.ctm = ctm;
2696
0
  sp.argb = hexrgba_from_color(ctx, cs, color, alpha);
2697
0
  sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0;
2698
0
  sp.page = page;
2699
0
  sp.count = 0;
2700
0
  sp.leftovers = fz_empty_rect;
2701
0
  sp.seg_bounds = fz_empty_rect;
2702
0
  sp.pending = fz_empty_rect;
2703
0
  sp.id = id;
2704
0
  sp.exp = exp;
2705
0
  fz_walk_path(ctx, path, &split_path_rects, &sp);
2706
2707
0
  have_leftovers = fz_is_valid_rect(sp.leftovers);
2708
2709
0
  if (!trailing_moves_acceptable || sp.count != 1)
2710
0
    maybe_rect(ctx, &sp);
2711
2712
0
  if ((!trailing_moves_acceptable || sp.count != 1) && fz_is_valid_rect(sp.pending))
2713
0
    add_vector(ctx, page, sp.dev, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb, id, exp);
2714
0
  if (have_leftovers)
2715
0
    add_vector(ctx, page, sp.dev, sp.leftovers, sp.flags, sp.argb, id, exp);
2716
0
}
2717
2718
static void
2719
fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2720
0
{
2721
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2722
0
  fz_stext_page *page = tdev->page;
2723
0
  fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
2724
0
  fz_rect *bounds = actualtext_bounds(tdev);
2725
2726
  /* If we're in an actualtext, then update the bounds to include this content. */
2727
0
  if (bounds != NULL)
2728
0
    *bounds = fz_union_rect(*bounds, path_bounds);
2729
2730
0
  if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2731
0
    check_for_strikeout(ctx, tdev, page, path, ctm, hexrgba_from_color(ctx, cs, color, alpha));
2732
2733
0
  if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2734
0
    add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, NULL, 0);
2735
0
}
2736
2737
static void
2738
fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2739
0
{
2740
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2741
0
  fz_stext_page *page = tdev->page;
2742
0
  fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm);
2743
0
  fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
2744
0
  float exp = ss->linewidth / 2;
2745
2746
  /* If we're in an actualtext, then update the bounds to include this content. */
2747
0
  if (bounds != NULL)
2748
0
    *bounds = fz_union_rect(*bounds, path_bounds);
2749
2750
0
  if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2751
0
    check_for_strikeout(ctx, tdev, page, path, ctm, hexrgba_from_color(ctx, cs, color, alpha));
2752
2753
0
  if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2754
0
    add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, ss, exp);
2755
0
}
2756
2757
static void
2758
new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
2759
0
{
2760
0
  fz_stext_struct *str;
2761
0
  size_t z;
2762
2763
0
  if (raw == NULL)
2764
0
    raw = "";
2765
0
  z = strlen(raw);
2766
2767
0
  str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1);
2768
0
  str->first_block = NULL;
2769
0
  str->last_block = NULL;
2770
0
  str->standard = standard;
2771
0
  str->parent = page->last_struct;
2772
0
  str->up = block;
2773
0
  memcpy(str->raw, raw, z+1);
2774
2775
0
  block->u.s.down = str;
2776
0
}
2777
2778
fz_stext_block *
2779
fz_new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_structure standard, const char *raw, int idx)
2780
0
{
2781
0
  fz_stext_block *block;
2782
2783
0
  block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
2784
0
  block->bbox = fz_empty_rect;
2785
0
  block->prev = NULL;
2786
0
  block->next = NULL;
2787
0
  block->type = FZ_STEXT_BLOCK_STRUCT;
2788
0
  block->u.s.index = idx;
2789
0
  block->u.s.down = NULL;
2790
  /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
2791
0
  new_stext_struct(ctx, page, block, standard, raw);
2792
2793
0
  return block;
2794
0
}
2795
2796
2797
static void
2798
fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
2799
0
{
2800
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2801
0
  fz_stext_page *page = tdev->page;
2802
0
  fz_stext_block *block, *le, *gt, *newblock;
2803
2804
0
  if (raw == NULL)
2805
0
    raw = "";
2806
2807
  /* Find a pointer to the last block. */
2808
0
  if (page->last_block)
2809
0
  {
2810
0
    block = page->last_block;
2811
0
  }
2812
0
  else if (page->last_struct)
2813
0
  {
2814
0
    block = page->last_struct->last_block;
2815
0
  }
2816
0
  else
2817
0
  {
2818
0
    block = page->first_block;
2819
0
  }
2820
2821
  /* So block is somewhere in the content chain. Let's try and find:
2822
   *   le = the struct node <= idx before block in the content chain.
2823
   *   ge = the struct node >= idx after block in the content chain.
2824
   * Search backwards to start with.
2825
   */
2826
0
  gt = NULL;
2827
0
  le = block;
2828
0
  while (le)
2829
0
  {
2830
0
    if (le->type == FZ_STEXT_BLOCK_STRUCT)
2831
0
    {
2832
0
      if (le->u.s.index > idx)
2833
0
        gt = le;
2834
0
      if (le->u.s.index <= idx)
2835
0
        break;
2836
0
    }
2837
0
    le = le->prev;
2838
0
  }
2839
  /* The following loop copes with finding gt (the smallest block with an index higher
2840
   * than we want) if we haven't found it already. The while loop in here was designed
2841
   * to cope with 'block' being in the middle of a list. In fact, the way the code is
2842
   * currently, block will always be at the end of a list, so the while won't do anything.
2843
   * But I'm loathe to remove it in case we ever change this code to start from wherever
2844
   * we did the last insertion. */
2845
0
  if (gt == NULL)
2846
0
  {
2847
0
    gt = block;
2848
0
    while (gt)
2849
0
    {
2850
0
      if (gt->type == FZ_STEXT_BLOCK_STRUCT)
2851
0
      {
2852
0
        if (gt->u.s.index <= idx)
2853
0
          le = gt;
2854
0
        if (gt->u.s.index >= idx)
2855
0
          break;
2856
0
      }
2857
0
      block = gt;
2858
0
      gt = gt->next;
2859
0
    }
2860
0
  }
2861
2862
0
  if (le && le->u.s.index == idx)
2863
0
  {
2864
    /* We want to move down into the le block. Does it have a struct
2865
     * attached yet? */
2866
0
    if (le->u.s.down == NULL)
2867
0
    {
2868
      /* No. We need to create a new struct node. */
2869
0
      new_stext_struct(ctx, page, le, standard, raw);
2870
0
    }
2871
0
    else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0)
2872
0
    {
2873
      /* Yes, but it doesn't match the one we expect! */
2874
0
      fz_warn(ctx, "Mismatched structure type!");
2875
0
    }
2876
0
    page->last_struct = le->u.s.down;
2877
0
    page->last_block = le->u.s.down->last_block;
2878
2879
0
    return;
2880
0
  }
2881
2882
  /* We are going to need to create a new block. Create a complete unlinked one here. */
2883
0
  newblock = fz_new_stext_struct(ctx, page, standard, raw, idx);
2884
2885
  /* So now we just need to link it in somewhere. */
2886
0
  if (gt)
2887
0
  {
2888
    /* Link it in before gt. */
2889
0
    newblock->prev = gt->prev;
2890
0
    if (gt->prev)
2891
0
      gt->prev->next = newblock;
2892
0
    else if (page->last_struct)
2893
0
    {
2894
      /* We're linking it in at the start under another struct! */
2895
0
      assert(page->last_struct->first_block == gt);
2896
0
      assert(page->last_struct->last_block != NULL);
2897
0
      page->last_struct->first_block = newblock;
2898
0
    }
2899
0
    else
2900
0
    {
2901
      /* We're linking it in at the start of the page! */
2902
0
      assert(page->first_block == gt);
2903
0
      page->first_block = newblock;
2904
0
    }
2905
0
    gt->prev = newblock;
2906
0
    newblock->next = gt;
2907
0
    newblock->id = gt->id;
2908
0
  }
2909
0
  else if (block)
2910
0
  {
2911
    /* Link it in at the end of the list (i.e. after 'block') */
2912
0
    newblock->prev = block;
2913
0
    block->next = newblock;
2914
0
    if (page->last_struct)
2915
0
    {
2916
0
      assert(page->last_struct->last_block == block);
2917
0
      page->last_struct->last_block = newblock;
2918
0
    }
2919
0
    else
2920
0
    {
2921
0
      assert(page->last_block == block);
2922
0
      page->last_block = newblock;
2923
0
    }
2924
0
    newblock->id = block->id;
2925
0
  }
2926
0
  else if (page->last_struct)
2927
0
  {
2928
    /* We have no blocks at all at this level. */
2929
0
    page->last_struct->first_block = newblock;
2930
0
    page->last_struct->last_block = newblock;
2931
0
    newblock->id = page->last_struct->up->id;
2932
0
  }
2933
0
  else
2934
0
  {
2935
    /* We have no blocks at ANY level. */
2936
0
    page->first_block = newblock;
2937
    /* newblock will have an id of 0. Best we can do. */
2938
0
  }
2939
  /* Wherever we linked it in, that's where we want to continue adding content. */
2940
0
  page->last_struct = newblock->u.s.down;
2941
0
  page->last_block = NULL;
2942
0
}
2943
2944
static void
2945
fz_stext_end_structure(fz_context *ctx, fz_device *dev)
2946
0
{
2947
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2948
0
  fz_stext_page *page = tdev->page;
2949
0
  fz_stext_struct *str = page->last_struct;
2950
2951
0
  if (str == NULL)
2952
0
  {
2953
0
    fz_warn(ctx, "Structure out of sync");
2954
0
    return;
2955
0
  }
2956
2957
0
  page->last_struct = str->parent;
2958
0
  if (page->last_struct == NULL)
2959
0
  {
2960
0
    page->last_block = page->first_block;
2961
    /* Yuck */
2962
0
    while (page->last_block->next)
2963
0
      page->last_block = page->last_block->next;
2964
0
  }
2965
0
  else
2966
0
  {
2967
0
    page->last_block = page->last_struct->last_block;
2968
0
  }
2969
0
}
2970
2971
fz_device *
2972
fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
2973
0
{
2974
0
  return fz_new_stext_device_for_page(ctx, page, opts, 0, 0, fz_empty_rect);
2975
0
}
2976
2977
fz_device *
2978
fz_new_stext_device_for_page(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts, int chapter_num, int page_num, fz_rect mediabox)
2979
0
{
2980
0
  fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
2981
2982
0
  dev->super.close_device = fz_stext_close_device;
2983
0
  dev->super.drop_device = fz_stext_drop_device;
2984
2985
0
  dev->super.fill_text = fz_stext_fill_text;
2986
0
  dev->super.stroke_text = fz_stext_stroke_text;
2987
0
  dev->super.clip_text = fz_stext_clip_text;
2988
0
  dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
2989
0
  dev->super.ignore_text = fz_stext_ignore_text;
2990
0
  dev->super.begin_metatext = fz_stext_begin_metatext;
2991
0
  dev->super.end_metatext = fz_stext_end_metatext;
2992
2993
0
  dev->super.fill_shade = fz_stext_fill_shade;
2994
0
  dev->super.fill_image = fz_stext_fill_image;
2995
0
  dev->super.fill_image_mask = fz_stext_fill_image_mask;
2996
2997
0
  if (opts)
2998
0
  {
2999
0
    dev->flags = opts->flags;
3000
0
    if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE)
3001
0
    {
3002
0
      dev->super.begin_structure = fz_stext_begin_structure;
3003
0
      dev->super.end_structure = fz_stext_end_structure;
3004
0
    }
3005
0
    if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES))
3006
0
    {
3007
0
      dev->super.fill_path = fz_stext_fill_path;
3008
0
      dev->super.stroke_path = fz_stext_stroke_path;
3009
0
    }
3010
0
  }
3011
0
  dev->page = page;
3012
0
  dev->pen.x = 0;
3013
0
  dev->pen.y = 0;
3014
0
  dev->trm = fz_identity;
3015
0
  dev->lastchar = ' ';
3016
0
  dev->lastline = NULL;
3017
0
  dev->lasttext = NULL;
3018
0
  dev->lastbidi = 0;
3019
0
  dev->last_was_fake_bold = 1;
3020
0
  if (opts)
3021
0
    dev->opts = *opts;
3022
3023
  /* If we are ignoring images, then it'd be nice to skip the decode costs. BUT we still need them to tell
3024
   * us the bounds for ActualText, so we can only actually skip them if we are ignoring actualtext too. */
3025
0
  if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && (dev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0)
3026
0
    dev->super.hints |= FZ_DONT_DECODE_IMAGES;
3027
3028
0
  dev->rect_max = 0;
3029
0
  dev->rect_len = 0;
3030
0
  dev->rects = NULL;
3031
3032
  /* Push a new id */
3033
0
  fz_try(ctx)
3034
0
  {
3035
0
    fz_stext_page_details *deets;
3036
0
    size_t id;
3037
0
    deets = fz_pool_array_append(ctx, page->id_list, &id);
3038
0
    dev->id = (int)id;
3039
0
    deets->mediabox = mediabox;
3040
0
    deets->chapter = chapter_num;
3041
0
    deets->page = page_num;
3042
0
  }
3043
0
  fz_catch(ctx)
3044
0
  {
3045
0
    fz_free(ctx, dev);
3046
0
    fz_rethrow(ctx);
3047
0
  }
3048
3049
0
  page->mediabox = fz_union_rect(page->mediabox, mediabox);
3050
3051
0
  return (fz_device*)dev;
3052
0
}