Coverage Report

Created: 2026-03-31 07:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/fitz/stext-device.c
Line
Count
Source
1
// Copyright (C) 2004-2026 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
// You should have received a copy of the GNU Affero General Public License
15
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
16
//
17
// Alternative licensing terms are available from the licensor.
18
// For commercial licensing, see <https://www.artifex.com/> or contact
19
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
20
// CA 94129, USA, for further information.
21
22
#include "mupdf/fitz.h"
23
24
#include "glyphbox.h"
25
26
#include <float.h>
27
#include <string.h>
28
29
/* Simple layout structure */
30
31
/*
 * Create an empty layout block.
 *
 * A fresh pool is created and the block itself is allocated from it, so
 * dropping the pool releases the block and everything later added to it.
 * If the allocation throws, the pool is dropped and the error rethrown.
 */
fz_layout_block *fz_new_layout(fz_context *ctx)
{
  fz_layout_block *layout;
  fz_pool *arena = fz_new_pool(ctx);

  fz_try(ctx)
  {
    layout = fz_pool_alloc(ctx, arena, sizeof *layout);
    layout->head = NULL;
    /* The line list is empty: the tail link is the head pointer itself. */
    layout->tailp = &layout->head;
    layout->pool = arena;
  }
  fz_catch(ctx)
  {
    fz_drop_pool(ctx, arena);
    fz_rethrow(ctx);
  }

  return layout;
}
49
50
/*
 * Release a layout block by dropping the pool it was allocated from.
 * Passing NULL is a no-op.
 */
void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
{
  if (block == NULL)
    return;

  fz_drop_pool(ctx, block->pool);
}
55
56
/*
 * Append a new, empty line to the end of a layout block.
 *
 * x, y, font_size and p are stored on the line as given. After this call,
 * characters added with fz_add_layout_char() are appended to this line.
 */
void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
{
  fz_layout_line *fresh = fz_pool_alloc(ctx, block->pool, sizeof *fresh);

  fresh->next = NULL;
  fresh->text = NULL;
  fresh->p = p;
  fresh->font_size = font_size;
  fresh->y = y;
  fresh->x = x;

  /* Hook the line onto the tail of the line list... */
  *block->tailp = fresh;
  block->tailp = &fresh->next;

  /* ...and make its (empty) character list the target for new chars. */
  block->text_tailp = &fresh->text;
}
69
70
/*
 * Append a character record to the line most recently added with
 * fz_add_layout_line().
 *
 * x, advance and p are stored on the record as given. The record is
 * written through block->text_tailp, which fz_add_layout_line() sets up.
 */
void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
{
  fz_layout_char *glyph = fz_pool_alloc(ctx, block->pool, sizeof *glyph);

  glyph->next = NULL;
  glyph->p = p;
  glyph->advance = advance;
  glyph->x = x;

  *block->text_tailp = glyph;
  block->text_tailp = &glyph->next;
}
80
81
/* Extract text into blocks and lines. */
82
83
0
/*
 * Layout thresholds. All are compared against pen movement that has been
 * divided by the text size (the matrix expansion), so they are in units of
 * the font size.
 */

/* Largest off-baseline offset that still counts as "new line in the same
 * paragraph"; anything beyond starts a new paragraph. */
#define PARAGRAPH_DIST 1.5f
/* Along-the-line gaps smaller than this are ignored. */
#define SPACE_DIST 0.15f
/* Along-the-line gaps up to this size stay on the same line (and may get a
 * synthetic space); larger jumps start a new line. */
#define SPACE_MAX_DIST 0.8f
/* Off-baseline offsets smaller than this are treated as the same line. */
#define BASE_MAX_DIST 0.8f
/* A repeated char within this distance of the previous one is treated as
 * fake bold (the same text drawn twice). */
#define FAKE_BOLD_MAX_DIST 0.1f
88
89
/* We keep a stack of the different metatexts that apply at any
90
 * given point (normally none!). Whenever we get some content
91
 * with a metatext in force, we really want to update the bounds
92
 * for that metatext. But running along the whole list each time
93
 * would be painful. So we just update the bounds for dev->metatext
94
 * and rely on metatext_bounds() propagating it upwards 'just in
95
 * time' for us to use metatexts other than the latest one. This
96
 * also means we need to propagate bounds upwards when we pop
97
 * a metatext.
98
 *
99
 * Why do we need bounds at all? Well, suppose we get:
100
 *    /Span <</ActualText (c) >> BDC /Im0 Do EMC
101
 * Then where on the page do we put 'c' ? By collecting the
102
 * bounds, we can place 'c' wherever the image was.
103
 */
104
/* One entry in the stack of metatexts currently in force. */
typedef struct metatext_t
{
  /* Kind of metatext; FZ_METATEXT_ACTUALTEXT is the one searched for here. */
  fz_metatext type;
  /* Text attached to this metatext. NOTE(review): ownership is not visible
   * in this part of the file - confirm who frees it. */
  char *text;
  /* Bounds of content seen under this metatext; merged into the enclosing
   * entry's bounds lazily by metatext_bounds(). */
  fz_rect bounds;
  /* Enclosing metatext (next entry down the stack), or NULL. */
  struct metatext_t *prev;
} metatext_t;
111
112
/* One entry in the device's list of 'rects' (see fz_stext_device.rects).
 * NOTE(review): presumably a thin rectangle reduced to a line segment
 * between two points plus a thickness - confirm against the code that
 * fills this in, which is not visible here. */
typedef struct
{
  fz_point from;
  fz_point to;
  float thickness;
} rect_details;
118
119
/* Device state used while extracting structured text into a page. */
typedef struct
{
  fz_device super;
  /* Page that extracted blocks, lines and chars are added to. */
  fz_stext_page *page;
  /* Id stamped onto newly created text blocks. */
  int id;
  /* pen: end position of the last char added.
   * start: start position of the first char on the current line. */
  fz_point pen, start;
  // maybe_bullet: True if the 'start' position recorded was done so after either some actualtext
  // on an image, or after a glyph that's known to be used for bullets. This is used to stop us
  // spotting an 'indented' paragraph, because it's possibly just a bulleted list.
  int maybe_bullet;
  /* Start position of the last char added. */
  fz_point lag_pen;
  /* Text render matrix of the last char added. */
  fz_matrix trm;
  /* Cleared whenever a char is added. NOTE(review): presumably set at the
   * start of each text object - the setter is not visible here. */
  int new_obj;
  /* Unicode value, line and bidi value of the last char added. */
  int lastchar;
  fz_stext_line *lastline;
  int lastbidi;
  /* FZ_STEXT_* option bits consulted while adding chars. */
  int flags;
  /* Colour stored on each added char (as its argb value). */
  int color;
  /* Set when the previous char was swallowed as a fake-bold overprint. */
  int last_was_fake_bold;
  /* NOTE(review): not used in the visible part of the file. */
  const fz_text *lasttext;
  /* Full options; opts.flags is checked for FZ_STEXT_COLLECT_STYLES. */
  fz_stext_options opts;

  /* Top of the stack of metatexts currently in force (NULL if none). */
  metatext_t *metatext;

  /* Store the last values we saw. We need this for flushing the actualtext. */
  struct
  {
    int valid;
    int clipped;
    fz_matrix trm;
    int wmode;
    int bidi_level;
    fz_font *font;
    int flags;
  } last;

  /* The list of 'rects' seen during processing (if we're collecting styles). */
  int rect_max;
  int rect_len;
  rect_details *rects;

  /* Vector blocks created by add_lazy_vector() but not yet linked into the
   * page; flush_lazy_vectors() splices the whole list in. */
  fz_stext_block *lazy_vectors;
  fz_stext_block *lazy_vectors_tail;
} fz_stext_device;
163
164
/* Help text listing the structured text option names, one per line.
 * This string is emitted at runtime; keep the wording in step with the
 * option parser. */
const char *fz_stext_options_usage =
  "Structured text options:\n"
  "\tpreserve-images: keep images in output\n"
  "\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
  "\tpreserve-spans: do not merge spans on the same line\n"
  "\tpreserve-whitespace: do not convert all whitespace into space characters\n"
  "\tinhibit-spaces: don't add spaces between gaps in the text\n"
  "\tparagraph-break: break blocks at paragraph boundaries\n"
  "\tdehyphenate: attempt to join up hyphenated words\n"
  "\tignore-actualtext: do not apply ActualText replacements\n"
  "\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
  "\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
  "\taccurate-bboxes: calculate char bboxes from the outlines\n"
  "\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
  "\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
  "\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
  "\tclip: do not include text that is completely clipped\n"
  "\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
  "\tstructured: collect structure markup\n"
  "\tvectors: include vector bboxes in output\n"
  "\tsegment: attempt to segment the page\n"
  "\ttable-hunt: hunt for tables within a (segmented) page\n"
  "\tresolution: resolution to render at\n"
  "\n";
188
189
/* Find the current actualtext, if any. Will abort if dev == NULL. */
190
static metatext_t *
191
find_actualtext(fz_stext_device *dev)
192
0
{
193
0
  metatext_t *mt = dev->metatext;
194
195
0
  while (mt && mt->type != FZ_METATEXT_ACTUALTEXT)
196
0
    mt = mt->prev;
197
198
0
  return mt;
199
0
}
200
201
/* Find the bounds of the given metatext. Will abort if mt or
202
 * dev are NULL. */
203
static fz_rect *
204
metatext_bounds(metatext_t *mt, fz_stext_device *dev)
205
0
{
206
0
  metatext_t *mt2 = dev->metatext;
207
208
0
  while (mt2 != mt)
209
0
  {
210
0
    mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds);
211
0
    mt2 = mt2->prev;
212
0
  }
213
214
0
  return &mt->bounds;
215
0
}
216
217
/* Find the bounds of the current actualtext, or NULL if there
218
 * isn't one. Will abort if dev is NULL. */
219
static fz_rect *
220
actualtext_bounds(fz_stext_device *dev)
221
0
{
222
0
  metatext_t *mt = find_actualtext(dev);
223
224
0
  if (mt == NULL)
225
0
    return NULL;
226
227
0
  return metatext_bounds(mt, dev);
228
0
}
229
230
fz_stext_page *
231
fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
232
0
{
233
0
  fz_pool *pool = fz_new_pool(ctx);
234
0
  fz_stext_page *page = NULL;
235
0
  fz_try(ctx)
236
0
  {
237
0
    page = fz_pool_alloc(ctx, pool, sizeof(*page));
238
0
    page->refs = 1;
239
0
    page->pool = pool;
240
0
    page->mediabox = mediabox;
241
0
    page->first_block = NULL;
242
0
    page->last_block = NULL;
243
0
    page->id_list = fz_new_pool_array(ctx, pool, fz_stext_page_details, 4);
244
0
  }
245
0
  fz_catch(ctx)
246
0
  {
247
0
    fz_drop_pool(ctx, pool);
248
0
    fz_rethrow(ctx);
249
0
  }
250
0
  return page;
251
0
}
252
253
static void
254
drop_run(fz_context *ctx, fz_stext_block *block)
255
0
{
256
0
  fz_stext_line *line;
257
0
  fz_stext_char *ch;
258
0
  while (block)
259
0
  {
260
0
    switch (block->type)
261
0
    {
262
0
    case FZ_STEXT_BLOCK_IMAGE:
263
0
      fz_drop_image(ctx, block->u.i.image);
264
0
      break;
265
0
    case FZ_STEXT_BLOCK_TEXT:
266
0
      for (line = block->u.t.first_line; line; line = line->next)
267
0
        for (ch = line->first_char; ch; ch = ch->next)
268
0
          fz_drop_font(ctx, ch->font);
269
0
      break;
270
0
    case FZ_STEXT_BLOCK_STRUCT:
271
0
      drop_run(ctx, block->u.s.down->first_block);
272
0
      break;
273
0
    default:
274
0
      break;
275
0
    }
276
0
    block = block->next;
277
0
  }
278
0
}
279
280
/*
 * Look up the page details entry for a block, using the block's id as the
 * index into the page's id list.
 *
 * Throws FZ_ERROR_ARGUMENT if either page or block is NULL.
 */
fz_stext_page_details *fz_stext_page_details_for_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block)
{
  fz_stext_page_details *details;

  if (page == NULL || block == NULL)
    fz_throw(ctx, FZ_ERROR_ARGUMENT, "page details require a page and a block");

  details = (fz_stext_page_details *)fz_pool_array_lookup(ctx, page->id_list, block->id);
  return details;
}
287
288
fz_stext_page *
289
fz_keep_stext_page(fz_context *ctx, fz_stext_page *page)
290
0
{
291
0
  return fz_keep_imp(ctx, page, &page->refs);
292
0
}
293
294
void
295
fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
296
0
{
297
0
  if (page == NULL)
298
0
    return;
299
300
0
  if (fz_drop_imp(ctx, page, &page->refs))
301
0
  {
302
0
    drop_run(ctx, page->first_block);
303
0
    fz_drop_pool(ctx, page->pool);
304
0
  }
305
0
}
306
307
/*
308
 * This adds a new block at the end of the page. This should not be used
309
 * to add 'struct' blocks to the page as those have to be added internally,
310
 * with more complicated pointer setup.
311
 */
312
static fz_stext_block *
313
add_block_to_page(fz_context *ctx, fz_stext_page *page, int type, int id)
314
0
{
315
0
  fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
316
0
  block->bbox = fz_empty_rect; /* Fixes bug 703267. */
317
0
  block->prev = page->last_block;
318
0
  block->type = type;
319
0
  block->id = id;
320
0
  if (page->last_struct)
321
0
  {
322
0
    if (page->last_struct->last_block)
323
0
    {
324
0
      block->prev = page->last_struct->last_block;
325
0
      block->prev->next = block;
326
0
      page->last_struct->last_block = block;
327
0
    }
328
0
    else
329
0
      page->last_struct->last_block = page->last_struct->first_block = block;
330
0
  }
331
0
  else if (!page->last_block)
332
0
  {
333
0
    assert(!page->first_block);
334
0
    page->first_block = page->last_block = block;
335
0
  }
336
0
  else
337
0
  {
338
0
    page->last_block->next = block;
339
0
    page->last_block = block;
340
0
  }
341
0
  return block;
342
0
}
343
344
static fz_stext_block *
345
add_lazy_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, int id)
346
0
{
347
0
  fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
348
0
  block->bbox = fz_empty_rect;
349
0
  block->prev = tdev->lazy_vectors_tail;
350
0
  block->type = FZ_STEXT_BLOCK_VECTOR;
351
0
  block->id = id;
352
353
0
  if (tdev->lazy_vectors == NULL)
354
0
    tdev->lazy_vectors = block;
355
0
  else
356
0
    tdev->lazy_vectors_tail->next = block;
357
0
  tdev->lazy_vectors_tail = block;
358
359
0
  return block;
360
0
}
361
362
static void
363
flush_lazy_vectors(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev)
364
0
{
365
0
  if (tdev->lazy_vectors == NULL)
366
0
    return;
367
368
0
  if (page->last_struct)
369
0
  {
370
0
    if (page->last_struct->last_block)
371
0
    {
372
0
      page->last_struct->last_block->next = tdev->lazy_vectors;
373
0
      tdev->lazy_vectors->prev = page->last_struct->last_block;
374
0
      page->last_struct->last_block = tdev->lazy_vectors_tail;
375
0
    }
376
0
    else
377
0
    {
378
0
      page->last_struct->first_block = tdev->lazy_vectors;
379
0
      page->last_struct->last_block = tdev->lazy_vectors_tail;
380
0
    }
381
0
  }
382
0
  else if (!page->last_block)
383
0
  {
384
0
    page->first_block = tdev->lazy_vectors;
385
0
    page->last_block = tdev->lazy_vectors_tail;
386
0
  }
387
0
  else
388
0
  {
389
0
    page->last_block->next = tdev->lazy_vectors;
390
0
    tdev->lazy_vectors->prev = page->last_block;
391
0
    page->last_block = tdev->lazy_vectors_tail;
392
0
  }
393
394
0
  tdev->lazy_vectors = tdev->lazy_vectors_tail = NULL;
395
0
}
396
397
static fz_stext_block *
398
add_text_block_to_page(fz_context *ctx, fz_stext_page *page, int id)
399
0
{
400
0
  return add_block_to_page(ctx, page, FZ_STEXT_BLOCK_TEXT, id);
401
0
}
402
403
static fz_stext_block *
404
add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image, int id)
405
0
{
406
0
  fz_stext_block *block = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_IMAGE, id);
407
0
  block->u.i.transform = ctm;
408
0
  block->u.i.image = fz_keep_image(ctx, image);
409
0
  block->bbox = fz_transform_rect(fz_unit_rect, ctm);
410
0
  return block;
411
0
}
412
413
static fz_stext_line *
414
add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
415
0
{
416
0
  fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
417
0
  line->prev = block->u.t.last_line;
418
0
  if (!block->u.t.first_line)
419
0
    block->u.t.first_line = block->u.t.last_line = line;
420
0
  else
421
0
  {
422
0
    block->u.t.last_line->next = line;
423
0
    block->u.t.last_line = line;
424
0
  }
425
426
0
  line->dir = *dir;
427
0
  line->wmode = wmode;
428
429
0
  return line;
430
0
}
431
432
0
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
433
0
#define NON_ACCURATE_GLYPH (-1)
434
435
static fz_stext_char *
436
add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
437
0
{
438
0
  fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
439
0
  fz_point a, d;
440
441
0
  if (!line->first_char)
442
0
    line->first_char = line->last_char = ch;
443
0
  else
444
0
  {
445
0
    line->last_char->next = ch;
446
0
    line->last_char = ch;
447
0
  }
448
449
0
  ch->c = c;
450
0
  ch->argb = color;
451
0
  ch->bidi = bidi;
452
0
  ch->origin = *p;
453
0
  ch->size = size;
454
0
  ch->font = fz_keep_font(ctx, font);
455
0
  ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0) | (synthetic > 1 ? FZ_STEXT_SYNTHETIC_LARGE : 0);
456
0
  if (font->flags.is_bold)
457
0
    ch->flags |= FZ_STEXT_BOLD;
458
459
0
  if (line->wmode == 0)
460
0
  {
461
0
    fz_rect bounds;
462
0
    int bounded = 0;
463
0
    a.x = 0;
464
0
    d.x = 0;
465
0
    if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
466
0
    {
467
      /* Added space, in accurate mode. */
468
0
      a.y = d.y = 0;
469
0
    }
470
0
    else if (glyph == NON_ACCURATE_GLYPH)
471
0
    {
472
      /* Non accurate mode. */
473
0
      a.y = fz_font_ascender(ctx, font);
474
0
      d.y = fz_font_descender(ctx, font);
475
0
    }
476
0
    else
477
0
    {
478
      /* Any glyph in accurate mode */
479
0
      bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
480
0
      bounded = 1;
481
0
      a.y = bounds.y1;
482
0
      d.y = bounds.y0;
483
0
    }
484
0
    if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
485
0
    {
486
0
      if (!bounded)
487
0
        bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
488
0
      if (a.x > bounds.x0)
489
0
        a.x = bounds.x0;
490
0
      if (d.y < bounds.x1)
491
0
        d.y = bounds.x1;
492
0
    }
493
0
  }
494
0
  else
495
0
  {
496
0
    a.x = 1;
497
0
    d.x = 0;
498
0
    a.y = 0;
499
0
    d.y = 0;
500
0
  }
501
0
  a = fz_transform_vector(a, trm);
502
0
  d = fz_transform_vector(d, trm);
503
504
0
  ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
505
0
  ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
506
0
  ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
507
0
  ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
508
509
0
  return ch;
510
0
}
511
512
/*
 * Reverse, in place, the singly linked run of chars that starts at 'curr'
 * and stops just before 'tail'. The old first char ends up pointing at
 * 'tail'; the new first char (the old last one) is returned.
 */
static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
{
  fz_stext_char *reversed = tail;

  while (curr != tail)
  {
    fz_stext_char *following = curr->next;

    curr->next = reversed;
    reversed = curr;
    curr = following;
  }

  return reversed;
}
525
526
/*
 * Reverse every run of two or more consecutive chars with a non-zero
 * bidi value within a line, and leave line->last_char pointing at the
 * final char of the rewritten list.
 */
static void reverse_bidi_line(fz_stext_line *line)
{
  /* 'link' is the pointer that currently leads to 'cur'. */
  fz_stext_char **link = &line->first_char;
  fz_stext_char *cur = line->first_char;

  while (cur != NULL)
  {
    if (cur->bidi)
    {
      fz_stext_char *run_end = cur;

      while (run_end->next != NULL && run_end->next->bidi)
        run_end = run_end->next;

      /* After reversal 'cur' is the last char of the run and its next
       * pointer leads to whatever followed the run. */
      if (run_end != cur)
        *link = reverse_bidi_span(cur, run_end->next);
    }

    line->last_char = cur;
    link = &cur->next;
    cur = cur->next;
  }
}
544
545
/* Return 1 if c is one of the code points treated as a hyphen, else 0. */
int fz_is_unicode_hyphen(int c)
{
  switch (c)
  {
  case '-':    /* hyphen-minus */
  case 0xAD:   /* soft hyphen */
  case 0x2010: /* hyphen */
  case 0x2011: /* non-breaking hyphen */
    return 1;
  default:
    return 0;
  }
}
550
551
static float
552
vec_dot(const fz_point *a, const fz_point *b)
553
0
{
554
0
  return a->x * b->x + a->y * b->y;
555
0
}
556
557
/*
 * Decide whether a synthetic space may follow the given character.
 *
 * Never after a space. Otherwise allowed after anything below U+0700
 * (basic latin, greek, cyrillic, hebrew, arabic) and within
 * U+2000..U+20CF (general punctuation, superscripts and subscripts,
 * and currency symbols).
 */
static int may_add_space(int lastchar)
{
  if (lastchar == ' ')
    return 0;
  if (lastchar < 0x700)
    return 1;
  return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
566
567
0
#define FAKEBOLD_THRESHOLD_RECIP (1.0f / FAKE_BOLD_MAX_DIST)
568
569
static int
570
is_within_fake_bold_distance(float a, float b, float size)
571
0
{
572
0
  a -= b;
573
0
  if (a < 0)
574
0
    a = -a;
575
576
0
  return FAKEBOLD_THRESHOLD_RECIP * a < size;
577
0
}
578
579
static int
580
font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
581
0
{
582
0
  unsigned char fdigest[16];
583
0
  unsigned char gdigest[16];
584
585
0
  if (f == g)
586
0
    return 1;
587
588
0
  if (strcmp(f->name, g->name) != 0)
589
0
    return 0;
590
591
0
  if (f->buffer == NULL || g->buffer == NULL)
592
0
    return 0;
593
594
0
  fz_font_digest(ctx, f, fdigest);
595
0
  fz_font_digest(ctx, g, gdigest);
596
597
0
  return (memcmp(fdigest, gdigest, 16) == 0);
598
0
}
599
600
static int
601
check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
602
0
{
603
0
  fz_stext_line *line;
604
0
  fz_stext_char *ch;
605
606
0
  for (; block != NULL; block = block->next)
607
0
  {
608
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
609
0
    {
610
0
      if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
611
0
        return 1;
612
0
    }
613
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
614
0
    {
615
0
      for (line = block->u.t.first_line; line != NULL; line = line->next)
616
0
      {
617
0
        fz_stext_char *pr = NULL;
618
0
        for (ch = line->first_char; ch != NULL; ch = ch->next)
619
0
        {
620
          /* Not perfect, but it'll do! */
621
0
          if (ch->c == c && is_within_fake_bold_distance(ch->origin.x, p.x, size) && is_within_fake_bold_distance(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
622
0
          {
623
            /* If we were filled before, and we are stroking now... */
624
0
            if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
625
0
              (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
626
0
            {
627
              /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
628
0
              ch->flags |= flags;
629
0
              return 1;
630
0
            }
631
            /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
632
             * as boldening if either the char before, or the char after were also boldened. */
633
0
            ch->flags |= flags;
634
635
0
            if (c == ' ')
636
0
            {
637
0
              if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
638
0
                (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
639
0
              {
640
                /* OK, we can be bold. */
641
0
                ch->flags |= FZ_STEXT_BOLD;
642
0
              }
643
              /* Whether we have recorded this as being bold or not, still
644
               * claim we did, so we swallow the space and don't reemit it. */
645
0
              return 1;
646
0
            }
647
0
            else
648
0
            {
649
0
              ch->flags |= FZ_STEXT_BOLD;
650
0
              return 1;
651
0
            }
652
0
          }
653
0
          pr = ch;
654
0
        }
655
0
      }
656
0
    }
657
0
  }
658
659
0
  return 0;
660
0
}
661
662
/*
 * Return 1 if c is a character that is plausibly being used as a list
 * bullet, else 0. The set is held as inclusive code point ranges.
 */
static int
plausible_bullet(int c)
{
  static const struct { int lo, hi; } bullets[] = {
    { '*', '*' },
    { 0x00B7, 0x00B7 },   /* Middle Dot */
    { 0x2022, 0x2023 },   /* Bullet, Triangular Bullet */
    { 0x2043, 0x2043 },   /* Hyphen Bullet */
    { 0x204C, 0x204D },   /* Back leftwards / rightwards bullet */
    { 0x2219, 0x2219 },   /* Bullet operator */
    { 0x25C9, 0x25C9 },   /* Fisheye */
    { 0x25CB, 0x25CB },   /* White circle */
    { 0x25CF, 0x25CF },   /* Black circle */
    { 0x25D8, 0x25D8 },   /* Inverse Bullet */
    { 0x25E6, 0x25E6 },   /* White Bullet */
    { 0x2619, 0x261F },   /* Reversed Rotated Floral Heart Bullet, pointing indexes */
    { 0x2660, 0x2667 },   /* Spade, Heart, Diamond and Club suits (black and white) */
    { 0x2765, 0x2765 },   /* Rotated Heavy Heart Black Heart Bullet */
    { 0x2767, 0x2767 },   /* Rotated Floral Heart Bullet / Fleuron */
    { 0x29BE, 0x29BF },   /* Circled White Bullet, Circled Bullet */
    { 0xFFFD, 0xFFFD },   /* UNICODE_REPLACEMENT_CHARACTER */
    { 0x1F446, 0x1F449 }, /* WHITE UP/DOWN/LEFT/RIGHT POINTING BACKHAND INDEX */
    { 0x1F597, 0x1F5A3 }, /* White down pointing left hand index .. BLACK DOWN POINTING BACKHAND INDEX */
    { 0x1FBC1, 0x1FBC3 }, /* LEFT/MIDDLE/RIGHT THIRD WHITE RIGHT POINTING INDEX */
  };
  unsigned int i;

  for (i = 0; i < sizeof bullets / sizeof bullets[0]; i++)
  {
    if (c >= bullets[i].lo && c <= bullets[i].hi)
      return 1;
  }

  return 0;
}
720
721
/*
 * Add one character to the page, deciding whether it continues the
 * current line, starts a new line, or starts a new block (paragraph),
 * and whether a synthetic space should be inserted before it.
 *
 * c is the unicode value and glyph the glyph id. Two negative glyph
 * values are special: -1 marks a no-glyph char in a cluster (added at the
 * current pen without moving it), and -2 marks a no-glyph char from an
 * actualtext (positioned normally, then treated as -1).
 * trm is the text render matrix, adv the advance, wmode the writing mode
 * (0 = horizontal). Only bit 0 of bidi (RTL-ness) is kept on entry.
 * force_new_line forces a line break before this char. flags is stored
 * on the new char.
 *
 * The char may be dropped entirely if it is judged to be a fake-bold
 * overprint of an existing char.
 */
static void
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
{
  fz_stext_page *page = dev->page;
  fz_stext_block *cur_block;
  fz_stext_line *cur_line = NULL;

  int new_para = 0;
  int new_line = 1;
  int add_space = 0;
  fz_point dir, ndir, p, q;
  float size;
  fz_point delta;
  float spacing = 0;
  float base_offset = 0;
  float dist;

  /* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
  bidi = bidi & 1;

  /* dir = direction vector for motion. ndir = normalised(dir) */
  if (wmode == 0)
  {
    dir.x = 1;
    dir.y = 0;
  }
  else
  {
    dir.x = 0;
    dir.y = -1;
  }
  dir = fz_transform_vector(dir, trm);
  ndir = fz_normalize_vector(dir);

  /* All distances below are divided by this, so thresholds are in units
   * of the text size. */
  size = fz_matrix_expansion(trm);

  /* We need to identify where glyphs 'start' (p) and 'stop' (q).
   * Each glyph holds its 'start' position, and the next glyph in the
   * span (or span->max if there is no next glyph) holds its 'end'
   * position.
   *
   * For both horizontal and vertical motion, trm->{e,f} gives the
   * origin (usually the bottom left) of the glyph.
   *
   * In horizontal mode:
   *   + p is bottom left.
   *   + q is the bottom right
   * In vertical mode:
   *   + p is top left (where it advanced from)
   *   + q is bottom left
   */
  if (wmode == 0)
  {
    p.x = trm.e;
    p.y = trm.f;
    q.x = trm.e + adv * dir.x;
    q.y = trm.f + adv * dir.y;
  }
  else
  {
    p.x = trm.e - adv * dir.x;
    p.y = trm.f - adv * dir.y;
    q.x = trm.e;
    q.y = trm.f;
  }

  //printf("%g,%g \"%c\" %g,%g\n", p.x, p.y, c, q.x, q.y);

  /* When collecting styles, search the whole page for a char this one
   * overprints; if found, the existing char is updated and this one is
   * swallowed. No-glyph chars following a swallowed char are swallowed
   * too. */
  if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
  {
    if (glyph == -1)
    {
      if (dev->last_was_fake_bold)
        return;
    }
    else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
    {
      dev->last_was_fake_bold = 1;
      return;
    }
    dev->last_was_fake_bold = 0;
  }

  /* Find current position to enter new text. */
  cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
  if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
    cur_block = NULL;
  cur_line = cur_block ? cur_block->u.t.last_line : NULL;

  /* We use glyph == -2 to indicate a no-glyph char from an actualtext. The position
   * is valid though, so we want to advance the pen for these. */
  if (cur_line && glyph == -1)
  {
    /* Don't advance pen or break lines for no-glyph characters in a cluster */
    add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
    dev->lastbidi = bidi;
    dev->lastchar = c;
    dev->lastline = cur_line;
    return;
  }

  if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
  {
    /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
     * then we can't append to the current block/line. */
    new_para = 1;
    new_line = 1;
  }
  else
  {
    /* Detect fake bold where text is printed twice in the same place. */
    /* Largely supplanted by the check_for_fake_bold mechanism above,
     * but we leave this in for backward compatibility as it's cheap,
     * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
    dist = hypotf(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y) / size;
    /* This can trigger improperly for glyphs that come from actualtext
     * as they are frequently overlaid. Therefore rely on glyph >= 0. */
    if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar && glyph >= 0)
      return;

    /* Calculate how far we've moved since the last character. */
    delta.x = p.x - dev->pen.x;
    delta.y = p.y - dev->pen.y;

    /* The transform has not changed, so we know we're in the same
     * direction. Calculate 2 distances; how far off the previous
     * baseline we are, together with how far along the baseline
     * we are from the expected position. */
    spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
    base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;

    /* Only a small amount off the baseline - we'll take this */
    if (fabsf(base_offset) < BASE_MAX_DIST)
    {
      /* If mixed LTR and RTL content */
      if ((bidi & 1) != (dev->lastbidi & 1))
      {
        /* Ignore jumps within line when switching between LTR and RTL text. */
        new_line = 0;
      }

      /* RTL */
      else if (bidi & 1)
      {
        /* Movement measured from the *start* of the previous char, plus
         * this char's advance. */
        fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
        float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;

        /* If the pen is where we would have been if we
         * had advanced backwards from the previous
         * character by this character's advance, we
         * are probably seeing characters emitted in
         * logical order.
         */
        if (fabsf(logical_spacing) < SPACE_DIST)
        {
          new_line = 0;
        }

        /* However, if the pen has advanced to where we would expect it
         * in an LTR context, we're seeing them emitted in visual order
         * and should flag them for reordering!
         */
        else if (fabsf(spacing) < SPACE_DIST)
        {
          bidi = 3; /* mark line as visual */
          new_line = 0;
        }

        /* And any other small jump could be a missing space. */
        else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
        {
          if (wmode == 0 && may_add_space(dev->lastchar))
            add_space = 1;
          new_line = 0;
        }
        else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
        {
          /* Motion is in line, but negative. We've probably got overlapping
           * chars here. Live with it. */
          new_line = 0;
        }
        else if (spacing > 0 && spacing < SPACE_MAX_DIST)
        {
          bidi = 3; /* mark line as visual */
          /* add_space is 1 for a normal gap, 2 for a gap wider than
           * twice SPACE_DIST (marked as a large synthetic space). */
          if (wmode == 0 && may_add_space(dev->lastchar))
            add_space = 1 + (spacing > SPACE_DIST*2);
          new_line = 0;
        }

        else
        {
          /* Motion is large and unexpected (probably a new table column). */
          new_line = 1;
        }
      }

      /* LTR or neutral character */
      else
      {
        if (fabsf(spacing) < SPACE_DIST)
        {
          /* Motion is in line and small enough to ignore. */
          new_line = 0;
        }
        else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
        {
          /* Motion is in line, but negative. We've probably got overlapping
           * chars here. Live with it. */
          new_line = 0;
        }
        else if (spacing > 0 && spacing < SPACE_MAX_DIST)
        {
          /* Motion is forward in line and large enough to warrant us adding a space. */
          if (wmode == 0 && may_add_space(dev->lastchar))
            add_space = 1 + (spacing > SPACE_DIST*2);
          new_line = 0;
        }
        else
        {
          /* Motion is large and unexpected (probably a new table column). */
          new_line = 1;
        }
      }
    }

    /* Enough for a new line, but not enough for a new paragraph */
    else if (fabsf(base_offset) <= PARAGRAPH_DIST)
    {
      /* Check indent to spot text-indent style paragraphs */
      if (wmode == 0 && cur_line && dev->new_obj)
        if ((p.x - dev->start.x) > 0.5f && !dev->maybe_bullet)
          new_para = 1;
      new_line = 1;
    }

    /* Way off the baseline - open a new paragraph */
    else
    {
      new_para = 1;
      new_line = 1;
    }
  }

  /* Start a new block (but only at the beginning of a text object) */
  if (new_para || !cur_block)
  {
    /* Pending vector blocks go in ahead of the new text block. */
    flush_lazy_vectors(ctx, page, dev);
    cur_block = add_text_block_to_page(ctx, page, dev->id);
    cur_line = cur_block->u.t.last_line;
  }

  /* When dehyphenating, a line that ended in a hyphen is flagged as
   * joined to the one that follows. */
  if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(dev->lastchar) && dev->lastline != NULL)
    dev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED;

  /* Start a new line */
  if (new_line || !cur_line || force_new_line)
  {
    cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
    dev->start = p;
    if (glyph == -2)
      dev->maybe_bullet = 1;
    else
      dev->maybe_bullet = plausible_bullet(c);
  }

  /* Henceforth treat such non-glyphs in the usual way. */
  if (glyph == -2)
    glyph = -1;

  /* Add synthetic space */
  if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
    add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, add_space, flags, dev->flags);

  add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);

  /* Remember where this char started (lag_pen) and ended (pen) for the
   * next call. */
  dev->lastchar = c;
  dev->lastbidi = bidi;
  dev->lastline = cur_line;
  dev->lag_pen = p;
  dev->pen = q;

  dev->new_obj = 0;
  dev->trm = trm;
}
1005
1006
static void
1007
fz_add_stext_char(fz_context *ctx,
1008
  fz_stext_device *dev,
1009
  fz_font *font,
1010
  int c,
1011
  int glyph,
1012
  fz_matrix trm,
1013
  float adv,
1014
  int wmode,
1015
  int bidi,
1016
  int force_new_line,
1017
  int flags)
1018
0
{
1019
  /* ignore when one unicode character maps to multiple glyphs */
1020
0
  if (c == -1)
1021
0
    return;
1022
1023
0
  if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS)
1024
0
    fz_calculate_font_ascender_descender(ctx, font);
1025
1026
0
  if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
1027
0
  {
1028
0
    switch (c)
1029
0
    {
1030
0
    case 0xFB00: /* ff */
1031
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1032
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
1033
0
      return;
1034
0
    case 0xFB01: /* fi */
1035
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1036
0
      fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
1037
0
      return;
1038
0
    case 0xFB02: /* fl */
1039
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1040
0
      fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
1041
0
      return;
1042
0
    case 0xFB03: /* ffi */
1043
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1044
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
1045
0
      fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
1046
0
      return;
1047
0
    case 0xFB04: /* ffl */
1048
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1049
0
      fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
1050
0
      fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
1051
0
      return;
1052
0
    case 0xFB05: /* long st */
1053
0
    case 0xFB06: /* st */
1054
0
      fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags);
1055
0
      fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags);
1056
0
      return;
1057
0
    }
1058
0
  }
1059
1060
0
  if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
1061
0
  {
1062
0
    switch (c)
1063
0
    {
1064
0
    case 0x0009: /* tab */
1065
0
    case 0x0020: /* space */
1066
0
    case 0x00A0: /* no-break space */
1067
0
    case 0x1680: /* ogham space mark */
1068
0
    case 0x180E: /* mongolian vowel separator */
1069
0
    case 0x2000: /* en quad */
1070
0
    case 0x2001: /* em quad */
1071
0
    case 0x2002: /* en space */
1072
0
    case 0x2003: /* em space */
1073
0
    case 0x2004: /* three-per-em space */
1074
0
    case 0x2005: /* four-per-em space */
1075
0
    case 0x2006: /* six-per-em space */
1076
0
    case 0x2007: /* figure space */
1077
0
    case 0x2008: /* punctuation space */
1078
0
    case 0x2009: /* thin space */
1079
0
    case 0x200A: /* hair space */
1080
0
    case 0x202F: /* narrow no-break space */
1081
0
    case 0x205F: /* medium mathematical space */
1082
0
    case 0x3000: /* ideographic space */
1083
0
      c = ' ';
1084
0
    }
1085
0
  }
1086
1087
0
  fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
1088
0
}
1089
1090
static fz_rect
1091
current_clip(fz_context *ctx, fz_stext_device *dev)
1092
0
{
1093
0
  fz_rect r = fz_infinite_rect;
1094
1095
0
  if (dev->flags & FZ_STEXT_CLIP)
1096
0
  {
1097
0
    r = fz_device_current_scissor(ctx, &dev->super);
1098
0
    r = fz_intersect_rect(r, dev->page->mediabox);
1099
0
  }
1100
0
  if (dev->flags & FZ_STEXT_CLIP_RECT)
1101
0
    r = fz_intersect_rect(r, dev->opts.clip);
1102
1103
0
  return r;
1104
0
}
1105
1106
static void
1107
do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
1108
0
{
1109
0
  fz_font *font = span->font;
1110
0
  fz_matrix tm = span->trm;
1111
0
  float adv;
1112
0
  int unicode;
1113
0
  int i;
1114
1115
0
  for (i = start; i < end; i++)
1116
0
  {
1117
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1118
0
    {
1119
0
      fz_rect r = current_clip(ctx, dev);
1120
0
      if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
1121
0
      {
1122
0
        dev->last.clipped = 1;
1123
0
        continue;
1124
0
      }
1125
0
    }
1126
0
    dev->last.clipped = 0;
1127
1128
    /* Calculate new pen location and delta */
1129
0
    tm.e = span->items[i].x;
1130
0
    tm.f = span->items[i].y;
1131
0
    dev->last.trm = fz_concat(tm, ctm);
1132
0
    dev->last.bidi_level = span->bidi_level;
1133
0
    dev->last.wmode = span->wmode;
1134
0
    if (font != dev->last.font)
1135
0
    {
1136
0
      fz_drop_font(ctx, dev->last.font);
1137
0
      dev->last.font = fz_keep_font(ctx, font);
1138
0
    }
1139
0
    dev->last.valid = 1;
1140
0
    dev->last.flags = flags;
1141
1142
    /* Calculate bounding box and new pen position based on font metrics */
1143
0
    if (span->items[i].gid >= 0)
1144
0
      adv = span->items[i].adv;
1145
0
    else
1146
0
      adv = 0;
1147
1148
0
    unicode = span->items[i].ucs;
1149
0
    if (unicode == FZ_REPLACEMENT_CHARACTER)
1150
0
    {
1151
0
      if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
1152
0
      {
1153
0
        unicode = span->items[i].cid;
1154
0
        flags |= FZ_STEXT_UNICODE_IS_CID;
1155
0
      }
1156
0
      else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
1157
0
      {
1158
0
        unicode = span->items[i].gid;
1159
0
        flags |= FZ_STEXT_UNICODE_IS_GID;
1160
0
      }
1161
0
    }
1162
1163
    /* Send the chars we have through. */
1164
0
    fz_add_stext_char(ctx, dev, font,
1165
0
      unicode,
1166
0
      span->items[i].gid,
1167
0
      dev->last.trm,
1168
0
      adv,
1169
0
      dev->last.wmode,
1170
0
      dev->last.bidi_level,
1171
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1172
0
      flags);
1173
0
  }
1174
0
}
1175
1176
/*
 * Return the rune at position idx (0-based, counted in runes) of the
 * UTF-8 string, or -1 if the terminating NUL is reached at or before
 * that position.
 */
static int
rune_index(const char *utf8, size_t idx)
{
  int rune;

  for (;;)
  {
    utf8 += fz_chartorune(&rune, utf8);
    if (rune == 0)
      return -1;
    /* Post-decrement on purpose: the body runs idx+1 times. */
    if (idx-- == 0)
      return rune;
  }
}
1192
1193
static void
1194
flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i, int end, float adv)
1195
0
{
1196
0
  if (*actualtext == 0)
1197
0
    return;
1198
1199
0
  if (!dev->last.valid)
1200
0
    return;
1201
1202
0
  if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1203
0
    if (dev->last.clipped)
1204
0
      return;
1205
1206
0
  if (adv != 0)
1207
0
  {
1208
0
    const char *at = actualtext;
1209
0
    int j = i;
1210
1211
0
    while (end < 0 || (end >= 0 && i < end))
1212
0
    {
1213
0
      int rune;
1214
0
      at += fz_chartorune(&rune, at);
1215
1216
0
      if (rune == 0)
1217
0
        break;
1218
0
      j++;
1219
0
    }
1220
1221
0
    if (j != i)
1222
0
      adv /= (j - i);
1223
0
  }
1224
1225
0
  while (end < 0 || (end >= 0 && i < end))
1226
0
  {
1227
0
    int rune;
1228
0
    actualtext += fz_chartorune(&rune, actualtext);
1229
1230
0
    if (rune == 0)
1231
0
      break;
1232
1233
0
    dev->last.trm.e = dev->pen.x;
1234
0
    dev->last.trm.f = dev->pen.y;
1235
1236
0
    fz_add_stext_char(ctx, dev, dev->last.font,
1237
0
      rune,
1238
0
      -2,
1239
0
      dev->last.trm,
1240
0
      adv,
1241
0
      dev->last.wmode,
1242
0
      dev->last.bidi_level,
1243
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1244
0
      dev->last.flags);
1245
0
    i++;
1246
0
  }
1247
0
}
1248
1249
static void
1250
do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
1251
0
{
1252
  /* We are within an actualtext block. This means we can't just add the chars
1253
   * as they are. We need to add the chars as they are meant to be. Sadly the
1254
   * actualtext mechanism doesn't help us at all with positioning. */
1255
0
  fz_font *font = span->font;
1256
0
  fz_matrix tm = span->trm;
1257
0
  float adv;
1258
0
  int start, i, end;
1259
0
  char *actualtext = mt->text;
1260
0
  size_t z = fz_utflen(actualtext);
1261
1262
  /* If actualtext is empty, nothing to do! */
1263
0
  if (z == 0)
1264
0
    return;
1265
1266
  /* Now, we HOPE that the creator of a PDF will minimise the actual text
1267
   * differences, so that we'll get:
1268
   *   "Politicians <Actualtext="lie">fib</ActualText>, always."
1269
   * rather than:
1270
   *   "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
1271
   * but experience with PDF files tells us that this won't always be the case.
1272
   *
1273
   * We try to minimise the actualtext section here, just in case.
1274
   */
1275
1276
  /* Spot a matching prefix and send it. */
1277
0
  for (start = 0; start < span->len; start++)
1278
0
  {
1279
0
    int rune;
1280
0
    int len = fz_chartorune(&rune, actualtext);
1281
0
    if (span->items[start].ucs != rune || rune == 0)
1282
0
      break;
1283
0
    actualtext += len; z--;
1284
0
  }
1285
0
  if (start != 0)
1286
0
    do_extract(ctx, dev, span, ctm, 0, start, flags);
1287
1288
0
  if (start == span->len)
1289
0
  {
1290
    /* The prefix has consumed all this object. Just shorten the actualtext and we'll
1291
     * catch the rest next time. */
1292
0
    z = strlen(actualtext)+1;
1293
0
    memmove(mt->text, actualtext, z);
1294
0
    return;
1295
0
  }
1296
1297
  /* We haven't consumed the whole string, so there must be runes left.
1298
   * Shut coverity up. */
1299
0
  assert(z != 0);
1300
1301
  /* Spot a matching postfix. Can't send it til the end. */
1302
0
  for (end = span->len; end > start; end--)
1303
0
  {
1304
    /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
1305
0
    int rune = rune_index(actualtext, z-1);
1306
0
    if (span->items[end-1].ucs != rune)
1307
0
      break;
1308
0
    z--;
1309
0
  }
1310
  /* So we can send end -> span->len at the end. */
1311
1312
  /* So we have at least SOME chars that don't match. */
1313
  /* Now, do the difficult bit in the middle.*/
1314
  /* items[start..end] have to be sent with actualtext[start..z] */
1315
0
  for (i = start; i < end; i++)
1316
0
  {
1317
0
    fz_text_item *item = &span->items[i];
1318
0
    int rune = -1;
1319
1320
0
    if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1321
0
    {
1322
0
      fz_rect r = current_clip(ctx, dev);
1323
0
      if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
1324
0
      {
1325
0
        dev->last.clipped = 1;
1326
0
        continue;
1327
0
      }
1328
0
    }
1329
0
    dev->last.clipped = 0;
1330
1331
0
    if ((size_t)i < z)
1332
0
      actualtext += fz_chartorune(&rune, actualtext);
1333
1334
    /* Calculate new pen location and delta */
1335
0
    tm.e = item->x;
1336
0
    tm.f = item->y;
1337
0
    dev->last.trm = fz_concat(tm, ctm);
1338
0
    dev->last.bidi_level = span->bidi_level;
1339
0
    dev->last.wmode = span->wmode;
1340
0
    if (font != dev->last.font)
1341
0
    {
1342
0
      fz_drop_font(ctx, dev->last.font);
1343
0
      dev->last.font = fz_keep_font(ctx, font);
1344
0
    }
1345
0
    dev->last.valid = 1;
1346
0
    dev->last.flags = flags;
1347
1348
    /* Calculate bounding box and new pen position based on font metrics */
1349
0
    if (item->gid >= 0)
1350
0
      adv = item->adv;
1351
0
    else
1352
0
      adv = 0;
1353
1354
0
    fz_add_stext_char(ctx, dev, font,
1355
0
      rune,
1356
0
      span->items[i].gid,
1357
0
      dev->last.trm,
1358
0
      adv,
1359
0
      dev->last.wmode,
1360
0
      dev->last.bidi_level,
1361
0
      (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1362
0
      flags);
1363
0
  }
1364
1365
  /* If we haven't spotted a postfix by this point, then don't force ourselves to output
1366
   * any more of the actualtext at this point. We might get a new text object that matches
1367
   * more of it. */
1368
0
  if (end == span->len)
1369
0
  {
1370
    /* Shorten actualtext and exit. */
1371
0
    z = strlen(actualtext)+1;
1372
0
    memmove(mt->text, actualtext, z);
1373
0
    return;
1374
0
  }
1375
1376
  /* if this is the first text on the page, and the actual text suffix matches the entire
1377
   * span text, then no font will have been set above, so set the last used font to the
1378
   * span font since flush_actualtext() assumes that a font has been set.
1379
   */
1380
0
  if (!dev->last.font)
1381
0
    dev->last.font = fz_keep_font(ctx, font);
1382
1383
  /* We found a matching postfix. It seems likely that this is going to be the only
1384
   * text object we get, so send any remaining actualtext now. */
1385
0
  flush_actualtext(ctx, dev, actualtext, i, i + (int)strlen(actualtext) - (span->len - end), 0);
1386
1387
  /* Send the postfix */
1388
0
  if (end != span->len)
1389
0
    do_extract(ctx, dev, span, ctm, end, span->len, flags);
1390
1391
0
  mt->text[0] = 0;
1392
0
}
1393
1394
static void
1395
fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
1396
0
{
1397
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1398
0
  metatext_t *mt = NULL;
1399
1400
0
  if (span->len == 0)
1401
0
    return;
1402
1403
  /* Are we in an actualtext? */
1404
0
  if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT))
1405
0
    mt = find_actualtext(dev);
1406
1407
0
  if (mt)
1408
0
    do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags);
1409
0
  else
1410
0
    do_extract(ctx, dev, span, ctm, 0, span->len, flags);
1411
0
}
1412
1413
/*
 * Convert a colour to device RGB and pack it, together with alpha, as
 * 0xAARRGGBB. Each channel is scaled by 255, rounded by adding 0.5 and
 * clamped to 0..255.
 */
static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
{
  float rgb[3];
  uint32_t a, r, g, b;

  fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);

  a = (uint32_t) fz_clampi(alpha * 255 + 0.5f, 0, 255);
  r = (uint32_t) fz_clampi(rgb[0] * 255 + 0.5f, 0, 255);
  g = (uint32_t) fz_clampi(rgb[1] * 255 + 0.5f, 0, 255);
  b = (uint32_t) fz_clampi(rgb[2] * 255 + 0.5f, 0, 255);

  return (a << 24) | (r << 16) | (g << 8) | b;
}
1423
1424
static void
1425
fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
1426
  fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1427
0
{
1428
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1429
0
  fz_text_span *span;
1430
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1431
0
    return;
1432
0
  tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1433
0
  tdev->new_obj = 1;
1434
0
  for (span = text->head; span; span = span->next)
1435
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED);
1436
0
  fz_drop_text(ctx, tdev->lasttext);
1437
0
  tdev->lasttext = fz_keep_text(ctx, text);
1438
0
}
1439
1440
static void
1441
fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
1442
  fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1443
0
{
1444
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1445
0
  fz_text_span *span;
1446
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1447
0
    return;
1448
0
  tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1449
0
  tdev->new_obj = 1;
1450
0
  for (span = text->head; span; span = span->next)
1451
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED);
1452
0
  fz_drop_text(ctx, tdev->lasttext);
1453
0
  tdev->lasttext = fz_keep_text(ctx, text);
1454
0
}
1455
1456
static void
1457
fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
1458
0
{
1459
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1460
0
  fz_text_span *span;
1461
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1462
0
    return;
1463
0
  tdev->color = 0;
1464
0
  tdev->new_obj = 1;
1465
0
  for (span = text->head; span; span = span->next)
1466
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
1467
0
  fz_drop_text(ctx, tdev->lasttext);
1468
0
  tdev->lasttext = fz_keep_text(ctx, text);
1469
0
}
1470
1471
static void
1472
fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
1473
0
{
1474
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1475
0
  fz_text_span *span;
1476
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1477
0
    return;
1478
0
  tdev->color = 0;
1479
0
  tdev->new_obj = 1;
1480
0
  for (span = text->head; span; span = span->next)
1481
0
    fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
1482
0
  fz_drop_text(ctx, tdev->lasttext);
1483
0
  tdev->lasttext = fz_keep_text(ctx, text);
1484
0
}
1485
1486
static void
1487
fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
1488
0
{
1489
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1490
0
  fz_text_span *span;
1491
0
  if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1492
0
    return;
1493
0
  tdev->color = 0;
1494
0
  tdev->new_obj = 1;
1495
0
  for (span = text->head; span; span = span->next)
1496
0
    fz_stext_extract(ctx, tdev, span, ctm, 0);
1497
0
  fz_drop_text(ctx, tdev->lasttext);
1498
0
  tdev->lasttext = fz_keep_text(ctx, text);
1499
0
}
1500
1501
static void
1502
fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
1503
0
{
1504
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1505
0
  metatext_t *mt = find_actualtext(tdev);
1506
0
  char *new_text = NULL;
1507
1508
0
  if (mt != NULL && meta == FZ_METATEXT_ACTUALTEXT)
1509
0
    flush_actualtext(ctx, tdev, mt->text, 0, -1, 0);
1510
1511
0
  if (meta == FZ_METATEXT_ACTUALTEXT)
1512
0
    tdev->last.valid = 0;
1513
1514
0
  new_text = text ? fz_strdup(ctx, text) : NULL;
1515
1516
0
  fz_try(ctx)
1517
0
  {
1518
0
    mt = fz_malloc_struct(ctx, metatext_t);
1519
1520
0
    mt->prev = tdev->metatext;
1521
0
    tdev->metatext = mt;
1522
0
    mt->type = meta;
1523
0
    mt->text = new_text;
1524
0
    mt->bounds = fz_empty_rect;
1525
0
  }
1526
0
  fz_catch(ctx)
1527
0
  {
1528
0
    fz_free(ctx, new_text);
1529
0
    fz_rethrow(ctx);
1530
0
  }
1531
0
}
1532
1533
static void
1534
pop_metatext(fz_context *ctx, fz_stext_device *dev)
1535
0
{
1536
0
  metatext_t *prev;
1537
0
  fz_rect bounds;
1538
1539
0
  if (!dev->metatext)
1540
0
    return;
1541
1542
0
  prev = dev->metatext->prev;
1543
0
  bounds = dev->metatext->bounds;
1544
0
  fz_free(ctx, dev->metatext->text);
1545
0
  fz_free(ctx, dev->metatext);
1546
0
  dev->metatext = prev;
1547
0
  if (prev)
1548
0
    prev->bounds = fz_union_rect(prev->bounds, bounds);
1549
0
}
1550
1551
static void
1552
fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
1553
0
{
1554
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1555
0
  fz_font *myfont = NULL;
1556
1557
0
  if (!tdev->metatext)
1558
0
    return; /* Mismatched pop. Live with it. */
1559
1560
0
  if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT || (tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0)
1561
0
  {
1562
    /* We only deal with ActualText here. Just pop anything else off,
1563
     * and we're done. */
1564
0
    pop_metatext(ctx, tdev);
1565
0
    return;
1566
0
  }
1567
1568
  /* If we have a 'last' text position, send the content after that. */
1569
0
  if (tdev->last.valid)
1570
0
  {
1571
0
    flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1, 0);
1572
0
    pop_metatext(ctx, tdev);
1573
0
    tdev->last.valid = 0;
1574
0
    return;
1575
0
  }
1576
1577
  /* Unless we have collected a rectangle for content that encloses the actual text,
1578
   * we can't do anything. */
1579
0
  if (fz_is_empty_rect(tdev->metatext->bounds))
1580
0
  {
1581
0
    if ((dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) == 0 && tdev->metatext->text[0])
1582
0
      fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned.");
1583
0
    pop_metatext(ctx, tdev);
1584
0
    return;
1585
0
  }
1586
1587
  /* We have a rectangle, so send the text to fill that. */
1588
0
  tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
1589
0
  tdev->last.trm.b = 0;
1590
0
  tdev->last.trm.c = 0;
1591
0
  tdev->last.trm.d = tdev->metatext->bounds.y0 - tdev->metatext->bounds.y1;
1592
0
  tdev->last.trm.e = tdev->metatext->bounds.x0;
1593
0
  tdev->last.trm.f = tdev->metatext->bounds.y1;
1594
0
  tdev->last.valid = 1;
1595
1596
0
  fz_var(myfont);
1597
1598
0
  fz_try(ctx)
1599
0
  {
1600
0
    if (tdev->last.font == NULL)
1601
0
    {
1602
0
      myfont = fz_new_base14_font(ctx, "Helvetica");
1603
0
      tdev->last.font = myfont;
1604
0
    }
1605
0
    flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1, 1);
1606
0
    pop_metatext(ctx, tdev);
1607
0
  }
1608
0
  fz_always(ctx)
1609
0
  {
1610
0
    if (myfont)
1611
0
    {
1612
0
      tdev->last.font = NULL;
1613
0
      fz_drop_font(ctx, myfont);
1614
0
    }
1615
0
  }
1616
0
  fz_catch(ctx)
1617
0
    fz_rethrow(ctx);
1618
0
}
1619
1620
1621
/* Images and shadings */
1622
1623
static void
1624
fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
1625
0
{
1626
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1627
0
  fz_rect *bounds = actualtext_bounds(tdev);
1628
1629
  /* If there is an actualtext in force, update its bounds. */
1630
0
  if (bounds)
1631
0
  {
1632
0
    static const fz_rect unit = { 0, 0, 1, 1 };
1633
0
    *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm));
1634
0
  }
1635
1636
  /* Unless we are being told to preserve images, nothing to do here. */
1637
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1638
0
    return;
1639
1640
  /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
1641
0
  if (alpha >= 0.5f)
1642
0
  {
1643
0
    fz_stext_block *block;
1644
0
    flush_lazy_vectors(ctx, tdev->page, tdev);
1645
0
    block = add_image_block_to_page(ctx, tdev->page, ctm, img, tdev->id);
1646
0
    if (tdev->opts.flags & FZ_STEXT_CLIP)
1647
0
    {
1648
0
      fz_rect clip = fz_device_current_scissor(ctx, dev);
1649
0
      clip = fz_intersect_rect(clip, tdev->page->mediabox);
1650
0
      block->bbox = fz_intersect_rect(block->bbox, clip);
1651
0
    }
1652
0
  }
1653
0
}
1654
1655
static void
1656
fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
1657
    fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
1658
0
{
1659
0
  fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
1660
0
}
1661
1662
static fz_image *
1663
fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
1664
0
{
1665
0
  fz_matrix ctm = *in_out_ctm;
1666
0
  fz_pixmap *pix;
1667
0
  fz_image *img = NULL;
1668
0
  fz_rect bounds;
1669
0
  fz_irect bbox;
1670
1671
0
  bounds = fz_bound_shade(ctx, shade, ctm);
1672
0
  bounds = fz_intersect_rect(bounds, scissor);
1673
0
  bbox = fz_irect_from_rect(bounds);
1674
1675
0
  pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
1676
0
  fz_try(ctx)
1677
0
  {
1678
0
    if (shade->use_background)
1679
0
      fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
1680
0
    else
1681
0
      fz_clear_pixmap(ctx, pix);
1682
0
    fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
1683
0
    img = fz_new_image_from_pixmap(ctx, pix, NULL);
1684
0
  }
1685
0
  fz_always(ctx)
1686
0
    fz_drop_pixmap(ctx, pix);
1687
0
  fz_catch(ctx)
1688
0
    fz_rethrow(ctx);
1689
1690
0
  in_out_ctm->a = pix->w;
1691
0
  in_out_ctm->b = 0;
1692
0
  in_out_ctm->c = 0;
1693
0
  in_out_ctm->d = pix->h;
1694
0
  in_out_ctm->e = pix->x;
1695
0
  in_out_ctm->f = pix->y;
1696
0
  return img;
1697
0
}
1698
1699
static void
1700
fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
1701
0
{
1702
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1703
0
  fz_matrix local_ctm;
1704
0
  fz_rect scissor;
1705
0
  fz_image *image;
1706
1707
  /* If we aren't preserving images, don't waste time making the shade. */
1708
0
  if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1709
0
  {
1710
    /* But we do still need to handle actualtext bounds. */
1711
0
    fz_rect *bounds = actualtext_bounds(tdev);
1712
0
    if (bounds)
1713
0
      *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
1714
0
    return;
1715
0
  }
1716
1717
0
  local_ctm = ctm;
1718
0
  scissor = fz_device_current_scissor(ctx, dev);
1719
0
  if (dev->flags & FZ_STEXT_CLIP_RECT)
1720
0
    scissor = fz_intersect_rect(scissor, tdev->opts.clip);
1721
0
  scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
1722
0
  image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
1723
0
  fz_try(ctx)
1724
0
    fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
1725
0
  fz_always(ctx)
1726
0
    fz_drop_image(ctx, image);
1727
0
  fz_catch(ctx)
1728
0
    fz_rethrow(ctx);
1729
0
}
1730
1731
static void
1732
fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
1733
0
{
1734
0
  fz_stext_line *line;
1735
0
  fz_stext_char *ch;
1736
1737
0
  for ( ; block != NULL; block = block->next)
1738
0
  {
1739
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1740
0
      if (block->u.s.down)
1741
0
        fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
1742
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1743
0
      continue;
1744
0
    for (line = block->u.t.first_line; line; line = line->next)
1745
0
    {
1746
0
      int reorder = 0;
1747
0
      for (ch = line->first_char; ch; ch = ch->next)
1748
0
      {
1749
0
        fz_rect ch_box = fz_rect_from_quad(ch->quad);
1750
0
        if (ch == line->first_char)
1751
0
          line->bbox = ch_box;
1752
0
        else
1753
0
          line->bbox = fz_union_rect(line->bbox, ch_box);
1754
0
        if (ch->bidi == 3)
1755
0
          reorder = 1;
1756
0
      }
1757
0
      block->bbox = fz_union_rect(block->bbox, line->bbox);
1758
0
      if (reorder)
1759
0
        reverse_bidi_line(line);
1760
0
    }
1761
0
  }
1762
0
}
1763
1764
static void
1765
advance_to_x(fz_point *a, fz_point b, float x)
1766
0
{
1767
0
  a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x);
1768
0
  a->x = x;
1769
0
}
1770
1771
static void
1772
advance_to_y(fz_point *a, fz_point b, float y)
1773
0
{
1774
0
  a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y);
1775
0
  a->y = y;
1776
0
}
1777
1778
static int
1779
line_crosses_rect(fz_point a, fz_point b, fz_rect r)
1780
0
{
1781
  /* Cope with trivial exclusions */
1782
0
  if (a.x < r.x0 && b.x < r.x0)
1783
0
    return 0;
1784
0
  if (a.x > r.x1 && b.x > r.x1)
1785
0
    return 0;
1786
0
  if (a.y < r.y0 && b.y < r.y0)
1787
0
    return 0;
1788
0
  if (a.y > r.y1 && b.y > r.y1)
1789
0
    return 0;
1790
1791
0
  if (a.x < r.x0)
1792
0
    advance_to_x(&a, b, r.x0);
1793
0
  if (a.x > r.x1)
1794
0
    advance_to_x(&a, b, r.x1);
1795
0
  if (a.y < r.y0)
1796
0
    advance_to_y(&a, b, r.y0);
1797
0
  if (a.y > r.y1)
1798
0
    advance_to_y(&a, b, r.y1);
1799
1800
0
  return fz_is_point_inside_rect(a, r);
1801
0
}
1802
1803
static float
1804
calculate_ascent(fz_point p, fz_point origin, fz_point dir)
1805
0
{
1806
0
  return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
1807
0
}
1808
1809
/* Create us a rect from the given quad, but extend it downwards
1810
 * to allow for underlines that pass under the glyphs. */
1811
static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
1812
0
{
1813
  /* Consider the two rects from A and g respectively.
1814
   *
1815
   * ul +------+ ur   or
1816
   *    |  /\  |         ul +------+ ur
1817
   *    | /__\ |            | /''\ |
1818
   *    |/    \|            |(    ||
1819
   * ll +------+ lr         | ''''||
1820
   *                        |  ''' | <-expected underline level
1821
   *                     ll +------+ lr
1822
   *
1823
   * So an underline won't cross A's rect, but will cross g's.
1824
   * We want to make a rect that includes a suitable amount of
1825
   * space underneath. The information we have available to us
1826
   * is summed up here:
1827
   *
1828
   *  ul +---------+ ur
1829
   *     |         |
1830
   *     | origin  |
1831
   *     |+----------> dir
1832
   *     |         |
1833
   *  ll +---------+ lr
1834
   *
1835
   * Consider the distance from ul to the line that passes through
1836
   * the origin with direction dir. Similarly, consider the distance
1837
   * from ur to the same line. This can be thought of as the 'ascent'
1838
   * of this character.
1839
   *
1840
   * We'd like the distance from ul to ll to be greater than this, so
1841
   * as to ensure we cover the possible location where an underline
1842
   * might reasonably go.
1843
   *
1844
   * If we have a line (l) through point A with direction vector u,
1845
   * the distance between point P and line(l) is:
1846
   *
1847
   * d(P,l) = || AP x u || / || u ||
1848
   *
1849
   * where x is the cross product.
1850
   *
1851
   * For us, because || dir || = 1:
1852
   *
1853
   * d(ul, origin) = || (origin-ul) x dir ||
1854
   *
1855
   * The cross product is only defined in 3 (or 7!) dimensions, so
1856
   * extend both vectors into 3d by defining a 0 z component.
1857
   *
1858
   * (origin-ul) x dir = [ (origin.y - ul.y) . 0     - 0                 . dir.y ]
1859
   *                     [ 0                 . dir.x - (origin.x - ul.y) . 0     ]
1860
   *                     [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
1861
   *
1862
   * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
1863
   */
1864
0
  float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
1865
0
  fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
1866
0
  fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
1867
0
  float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
1868
0
  int neg = 0;
1869
0
  float extra_rise = 0;
1870
1871
  /* Spaces will have 0 ascent. underscores will have small ascent.
1872
   * We want a sane ascent to be able to spot strikeouts, but not
1873
   * so big that it incorporates lines above the text, like borders. */
1874
0
  if (ascent < 0.75*size)
1875
0
    extra_rise = 0.75*size - ascent;
1876
1877
  /* We'd like height to be at least ascent + 1/4 size */
1878
0
  if (height < 0)
1879
0
    neg = 1, height = -height;
1880
0
  if (height < ascent + size * 0.25f)
1881
0
    height = ascent + size * 0.25f;
1882
1883
0
  height -= ascent;
1884
0
  if (neg)
1885
0
    height = -height;
1886
0
  quad.ll.x += - height * dir.y;
1887
0
  quad.ll.y +=   height * dir.x;
1888
0
  quad.lr.x += - height * dir.y;
1889
0
  quad.lr.y +=   height * dir.x;
1890
0
  quad.ul.x -= - extra_rise * dir.y;
1891
0
  quad.ul.y -=   extra_rise * dir.x;
1892
0
  quad.ur.x -= - extra_rise * dir.y;
1893
0
  quad.ur.y -=   extra_rise * dir.x;
1894
1895
0
  return fz_rect_from_quad(quad);
1896
0
}
1897
1898
/* Approximate float comparison: returns non-zero when the absolute
 * difference between a and b is smaller than EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
  float delta = a - b;

  return (delta < 0 ? -delta : delta) < EPSILON;
}
1906
1907
static void
1908
check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness)
1909
0
{
1910
0
  for ( ; block; block = block->next)
1911
0
  {
1912
0
    fz_stext_line *line;
1913
1914
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1915
0
      continue;
1916
1917
0
    for (line = block->u.t.first_line; line != NULL; line = line->next)
1918
0
    {
1919
0
      fz_stext_char *ch;
1920
1921
0
      if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) &&
1922
0
        (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y)))
1923
0
        continue;
1924
1925
      /* Matching directions... */
1926
1927
      /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
1928
       * chars. - FIXME: Now we do! */
1929
0
      for (ch = line->first_char; ch; ch = ch->next)
1930
0
      {
1931
0
        fz_point up;
1932
0
        float dx, dy, dot;
1933
0
        fz_rect ch_box;
1934
1935
        /* If the thickness is more than a 1/4 of the size, it's a highlight, not a
1936
         * line! */
1937
0
        if (ch->size < thickness*4)
1938
0
          continue;
1939
1940
0
        ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
1941
1942
0
        if (!line_crosses_rect(from, to, ch_box))
1943
0
          continue;
1944
1945
        /* Is this a strikeout or an underline? */
1946
1947
        /* The baseline moves from ch->origin in the direction line->dir */
1948
0
        up.x = line->dir.y;
1949
0
        up.y = -line->dir.x;
1950
1951
        /* How far is our line displaced from the line through the origin? */
1952
0
        dx = from.x - ch->origin.x;
1953
0
        dy = from.y - ch->origin.y;
1954
        /* Dot product with up. up is normalised */
1955
0
        dot = dx * up.x + dy * up.y;
1956
1957
0
        if (dot > 0 && dot <= 0.8f * ch->font->ascender * ch->size)
1958
0
          ch->flags |= FZ_STEXT_STRIKEOUT;
1959
0
        else
1960
0
          ch->flags |= FZ_STEXT_UNDERLINE;
1961
0
      }
1962
0
    }
1963
0
  }
1964
0
}
1965
1966
static void
1967
check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page)
1968
0
{
1969
0
  int i, n = tdev->rect_len;
1970
1971
0
  for (i = 0; i < n; i++)
1972
0
  {
1973
0
    fz_point from = tdev->rects[i].from;
1974
0
    fz_point to = tdev->rects[i].to;
1975
0
    float thickness = tdev->rects[i].thickness;
1976
0
    fz_point dir;
1977
0
    dir.x = to.x - from.x;
1978
0
    dir.y = to.y - from.y;
1979
0
    dir = fz_normalize_vector(dir);
1980
1981
0
    check_strikeout(ctx, page->first_block, from, to, dir, thickness);
1982
0
  }
1983
0
}
1984
1985
static void
1986
fz_stext_close_device(fz_context *ctx, fz_device *dev)
1987
0
{
1988
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
1989
0
  fz_stext_page *page = tdev->page;
1990
1991
0
  if ((tdev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(tdev->lastchar) && tdev->lastline != NULL)
1992
0
    tdev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED;
1993
1994
0
  flush_lazy_vectors(ctx, page, tdev);
1995
1996
0
  fixup_bboxes_and_bidi(ctx, page->first_block);
1997
1998
0
  if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES)
1999
0
    check_rects_for_strikeout(ctx, tdev, page);
2000
2001
  /* TODO: smart sorting of blocks and lines in reading order */
2002
  /* TODO: unicode NFC normalization */
2003
2004
0
  if (tdev->opts.flags & FZ_STEXT_SEGMENT)
2005
0
    fz_segment_stext_page(ctx, page);
2006
2007
0
  if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK)
2008
0
    fz_paragraph_break(ctx, page);
2009
2010
0
  if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT)
2011
0
    fz_table_hunt(ctx, page);
2012
0
}
2013
2014
static void
2015
fz_stext_drop_device(fz_context *ctx, fz_device *dev)
2016
0
{
2017
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2018
0
  fz_drop_text(ctx, tdev->lasttext);
2019
0
  fz_drop_font(ctx, tdev->last.font);
2020
0
  while (tdev->metatext)
2021
0
    pop_metatext(ctx, tdev);
2022
2023
0
  fz_free(ctx, tdev->rects);
2024
0
}
2025
2026
static int
2027
val_is_rect(const char *val, fz_rect *rp)
2028
0
{
2029
0
  fz_rect r;
2030
0
  const char *s;
2031
2032
0
  s = strchr(val, ':');
2033
0
  if (s == NULL || s == val)
2034
0
    return 0;
2035
0
  r.x0 = fz_atof(val);
2036
0
  val = s+1;
2037
0
  s = strchr(val, ':');
2038
0
  if (s == NULL || s == val)
2039
0
    return 0;
2040
0
  r.y0 = fz_atof(val);
2041
0
  val = s+1;
2042
0
  s = strchr(val, ':');
2043
0
  if (s == NULL || s == val)
2044
0
    return 0;
2045
0
  r.x1 = fz_atof(val);
2046
0
  val = s+1;
2047
0
  r.y1 = fz_atof(val);
2048
2049
0
  *rp = r;
2050
2051
0
  return 1;
2052
0
}
2053
2054
/* Reset *opts to its defaults: everything zeroed, then clipping enabled
 * and a scale of 1. */
void fz_init_stext_options(fz_context *ctx, fz_stext_options *opts)
{
  memset(opts, 0, sizeof *opts);

  opts->scale = 1;
  opts->flags |= FZ_STEXT_CLIP;
}
2061
2062
fz_stext_options *
2063
fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
2064
0
{
2065
0
  fz_options *options = fz_new_options(ctx, string);
2066
0
  fz_try(ctx)
2067
0
  {
2068
0
    fz_init_stext_options(ctx, opts);
2069
0
    fz_apply_stext_options(ctx, opts, options);
2070
0
    fz_throw_on_unused_options(ctx, options, "stext");
2071
0
  }
2072
0
  fz_always(ctx)
2073
0
    fz_drop_options(ctx, options);
2074
0
  fz_catch(ctx)
2075
0
    fz_rethrow(ctx);
2076
0
  return opts;
2077
0
}
2078
2079
/* Set the bits C in A when B is true, clear them when B is false.
 * The whole expansion is parenthesised so that the macro can be used safely
 * inside a larger expression as well as a plain statement. */
#define SETCLEARBOOL(A, B, C) \
  ((A) = (B) ? ((A) | (C)) : ((A) & ~(C)))
2081
2082
void
2083
fz_apply_stext_options(fz_context *ctx, fz_stext_options *opts, fz_options *string)
2084
0
{
2085
0
  const char *val;
2086
0
  float x;
2087
0
  int b;
2088
2089
  /* when adding options, remember to update fz_stext_options_usage above */
2090
2091
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-ligatures", &b))
2092
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_LIGATURES);
2093
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-whitespace", &b))
2094
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_WHITESPACE);
2095
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-images", &b))
2096
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_IMAGES);
2097
0
  if (fz_lookup_option_boolean(ctx, string, "inhibit-spaces", &b))
2098
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_INHIBIT_SPACES);
2099
0
  if (fz_lookup_option_boolean(ctx, string, "dehyphenate", &b))
2100
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_DEHYPHENATE);
2101
0
  if (fz_lookup_option_boolean(ctx, string, "preserve-spans", &b))
2102
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_SPANS);
2103
0
  if (fz_lookup_option_boolean(ctx, string, "structured", &b))
2104
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STRUCTURE);
2105
0
  if (fz_lookup_option_boolean(ctx, string, "use-cid-for-unknown-unicode", &b))
2106
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE);
2107
0
  if (fz_lookup_option_boolean(ctx, string, "use-gid-for-unknown-unicode", &b))
2108
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE);
2109
0
  if (fz_lookup_option_boolean(ctx, string, "accurate-bboxes", &b))
2110
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_BBOXES);
2111
0
  if (fz_lookup_option_boolean(ctx, string, "vectors", &b))
2112
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_VECTORS);
2113
0
  if (fz_lookup_option_boolean(ctx, string, "lazy-vectors", &b))
2114
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_LAZY_VECTORS);
2115
0
  if (fz_lookup_option_boolean(ctx, string, "fuzzy-vectors", &b))
2116
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_FUZZY_VECTORS);
2117
0
  if (fz_lookup_option_boolean(ctx, string, "ignore-actualtext", &b))
2118
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_IGNORE_ACTUALTEXT);
2119
0
  if (fz_lookup_option_boolean(ctx, string, "segment", &b))
2120
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_SEGMENT);
2121
0
  if (fz_lookup_option_boolean(ctx, string, "paragraph-break", &b))
2122
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_PARAGRAPH_BREAK);
2123
0
  if (fz_lookup_option_boolean(ctx, string, "table-hunt", &b))
2124
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_TABLE_HUNT);
2125
0
  if (fz_lookup_option_boolean(ctx, string, "collect-styles", &b))
2126
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STYLES);
2127
0
  if (fz_lookup_option_boolean(ctx, string, "accurate-ascenders", &b))
2128
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_ASCENDERS);
2129
0
  if (fz_lookup_option_boolean(ctx, string, "accurate-side-bearings", &b))
2130
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_SIDE_BEARINGS);
2131
2132
0
  if (fz_lookup_option_boolean(ctx, string, "mediabox-clip", &b))
2133
0
  {
2134
0
    fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
2135
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP);
2136
0
  }
2137
0
  if (fz_lookup_option_boolean(ctx, string, "clip", &b))
2138
0
    SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP);
2139
2140
0
  if (fz_lookup_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
2141
0
    opts->flags |= FZ_STEXT_CLIP_RECT;
2142
2143
0
  if (fz_lookup_option_float(ctx, string, "resolution", &x))
2144
0
    opts->scale = x / 96.0f; /* HTML base resolution is 96ppi */
2145
2146
0
  fz_validate_options(ctx, string, "stext");
2147
0
}
2148
2149
typedef struct
2150
{
2151
  int fail;
2152
  int count;
2153
  fz_point corners[4];
2154
} is_rect_data;
2155
2156
static void
2157
stash_point(is_rect_data *rd, float x, float y)
2158
0
{
2159
0
  if (rd->count > 3)
2160
0
  {
2161
0
    rd->fail = 1;
2162
0
    return;
2163
0
  }
2164
2165
0
  rd->corners[rd->count].x = x;
2166
0
  rd->corners[rd->count].y = y;
2167
0
  rd->count++;
2168
0
}
2169
2170
static void
2171
is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
2172
0
{
2173
0
  is_rect_data *rd = arg;
2174
0
  if (rd->fail)
2175
0
    return;
2176
2177
0
  if (rd->count != 0)
2178
0
  {
2179
0
    rd->fail = 1;
2180
0
    return;
2181
0
  }
2182
0
  stash_point(rd, x, y);
2183
0
}
2184
2185
static void
2186
is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
2187
0
{
2188
0
  is_rect_data *rd = arg;
2189
0
  if (rd->fail)
2190
0
    return;
2191
2192
0
  if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
2193
0
    return;
2194
2195
0
  stash_point(rd, x, y);
2196
0
}
2197
2198
static void
2199
is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
2200
0
{
2201
0
  is_rect_data *rd = arg;
2202
0
  rd->fail = 1;
2203
0
}
2204
2205
static void
2206
is_rect_closepath(fz_context *ctx, void *arg)
2207
0
{
2208
0
  is_rect_data *rd = arg;
2209
0
  if (rd->fail)
2210
0
    return;
2211
0
  if (rd->count == 3)
2212
0
    stash_point(rd, rd->corners[0].x, rd->corners[0].y);
2213
0
  if (rd->count != 4)
2214
0
    rd->fail = 1;
2215
0
}
2216
2217
static int
2218
is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
2219
0
{
2220
0
  float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
2221
0
  is_rect_data rd = { 0 };
2222
0
  static const fz_path_walker walker =
2223
0
  {
2224
0
    is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
2225
0
  };
2226
0
  int i;
2227
2228
0
  fz_walk_path(ctx, path, &walker, &rd);
2229
2230
0
  if (rd.fail)
2231
0
    return 0;
2232
2233
0
  if (rd.count == 2)
2234
0
  {
2235
0
    stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
2236
0
    stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
2237
0
  }
2238
2239
0
  for (i = 0 ; i < 4; i++)
2240
0
  {
2241
0
    fz_point p = fz_transform_point(rd.corners[i], ctm);
2242
2243
0
    rd.corners[i].x = p.x;
2244
0
    rd.corners[i].y = p.y;
2245
0
  }
2246
2247
  /* So we have a 4 cornered path. Hopefully something like:
2248
   * 0---------1
2249
   * |         |
2250
   * 3---------2
2251
   * but it might be:
2252
   * 0---------3
2253
   * |         |
2254
   * 1---------2
2255
  */
2256
0
  while (1)
2257
0
  {
2258
0
    d01x = rd.corners[1].x - rd.corners[0].x;
2259
0
    d01y = rd.corners[1].y - rd.corners[0].y;
2260
0
    d01 = d01x * d01x + d01y * d01y;
2261
0
    d03x = rd.corners[3].x - rd.corners[0].x;
2262
0
    d03y = rd.corners[3].y - rd.corners[0].y;
2263
0
    d03 = d03x * d03x + d03y * d03y;
2264
0
    if(d01 < d03)
2265
0
    {
2266
      /* We are the latter case. Transpose it. */
2267
0
      fz_point p = rd.corners[1];
2268
0
      rd.corners[1] = rd.corners[3];
2269
0
      rd.corners[3] = p;
2270
0
    }
2271
0
    else
2272
0
      break;
2273
0
  }
2274
0
  d32x = rd.corners[2].x - rd.corners[3].x;
2275
0
  d32y = rd.corners[2].y - rd.corners[3].y;
2276
2277
  /* So d32x and d01x need to be the same for this to be a strikeout. */
2278
0
  if (!feq(d32x, d01x) || !feq(d32y, d01y))
2279
0
    return 0;
2280
2281
  /* We are plausibly a rectangle. */
2282
0
  *thickness = sqrtf(d03x * d03x + d03y * d03y);
2283
2284
0
  from->x = (rd.corners[0].x + rd.corners[3].x)/2;
2285
0
  from->y = (rd.corners[0].y + rd.corners[3].y)/2;
2286
0
  to->x = (rd.corners[1].x + rd.corners[2].x)/2;
2287
0
  to->y = (rd.corners[1].y + rd.corners[2].y)/2;
2288
2289
0
  return 1;
2290
0
}
2291
2292
static void
2293
check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
2294
0
{
2295
0
  float thickness;
2296
0
  fz_point from, to;
2297
2298
  /* Is this path a thin rectangle (possibly rotated)? If so, then we need to
2299
   * consider it as being a strikeout or underline. */
2300
0
  if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm))
2301
0
    return;
2302
2303
  /* Add to the list of rects in the device. */
2304
0
  if (tdev->rect_len == tdev->rect_max)
2305
0
  {
2306
0
    int newmax = tdev->rect_max * 2;
2307
0
    if (newmax == 0)
2308
0
      newmax = 32;
2309
2310
0
    tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax);
2311
0
    tdev->rect_max = newmax;
2312
0
  }
2313
0
  tdev->rects[tdev->rect_len].from = from;
2314
0
  tdev->rects[tdev->rect_len].to = to;
2315
0
  tdev->rects[tdev->rect_len].thickness = thickness;
2316
0
  tdev->rect_len++;
2317
0
}
2318
2319
static void
2320
add_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, fz_rect bbox, uint32_t flags, uint32_t argb, int id, float exp)
2321
0
{
2322
0
  fz_stext_block *b;
2323
2324
0
  if (exp != 0)
2325
0
  {
2326
0
    bbox.x0 -= exp;
2327
0
    bbox.y0 -= exp;
2328
0
    bbox.x1 += exp;
2329
0
    bbox.y1 += exp;
2330
0
  }
2331
2332
0
  if (tdev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP))
2333
0
  {
2334
0
    fz_rect r = current_clip(ctx, tdev);
2335
0
    bbox = fz_intersect_rect(bbox, r);
2336
0
    if (!fz_is_valid_rect(bbox))
2337
0
      return;
2338
0
  }
2339
2340
  /* Can we just add this one onto the previous one? */
2341
  /* Only if it's a small rectangle... */
2342
0
  if ((flags & FZ_STEXT_VECTOR_IS_RECTANGLE) && bbox.x1 - bbox.x0 <= 2 && bbox.y1 - bbox.y0 <= 2)
2343
0
  {
2344
0
    fz_stext_block *prev;
2345
    /* Find b = the previous block. */
2346
0
    if (tdev->flags & FZ_STEXT_LAZY_VECTORS)
2347
0
      b = tdev->lazy_vectors_tail;
2348
0
    else if (page->last_struct)
2349
0
      b = page->last_struct->last_block;
2350
0
    else
2351
0
      b = page->last_block;
2352
2353
0
    if (b && b->type == FZ_STEXT_BLOCK_VECTOR && b->u.v.argb == argb && b->u.v.flags == flags)
2354
0
    {
2355
      /* Maybe we can join it? */
2356
0
      float fudge = 0.001f;
2357
0
      if (b->bbox.x0 == bbox.x0 && b->bbox.x1 == bbox.x1 && b->bbox.y1 + fudge >= bbox.y0 && b->bbox.y0 - fudge <= bbox.y1)
2358
0
      {
2359
        /* Stacks vertically. */
2360
0
        b->bbox.y0 = fz_min(b->bbox.y0, bbox.y0);
2361
0
        b->bbox.y1 = fz_max(b->bbox.y1, bbox.y1);
2362
0
        return;
2363
0
      }
2364
0
      else if (b->bbox.y0 == bbox.y0 && b->bbox.y1 == bbox.y1 && b->bbox.x1 + fudge >= bbox.x0 && b->bbox.x0 - fudge <= bbox.x1)
2365
0
      {
2366
        /* Stacks horizontally. */
2367
0
        b->bbox.x0 = fz_min(b->bbox.x0, bbox.x0);
2368
0
        b->bbox.x1 = fz_max(b->bbox.x1, bbox.x1);
2369
0
        return;
2370
0
      }
2371
2372
      /* So, we can't add our new vector onto the previous one. But can we merge the 2 previous ones? */
2373
      /* The intent here is that we allow a set of vector 'blocks' to be merged together, perhaps:
2374
       *    ABC
2375
       * Then we allow another set to be merged together, perhaps DE:
2376
       *    ABC
2377
       *    DE
2378
       * Then when we get another block that can't be merged into DE (perhaps F):
2379
       *    ABC
2380
       *    DE
2381
       *    F
2382
       * We'll consider ABC and DE for merging. Whatevever block that F ends up
2383
       * in later (maybe FGH):
2384
       *    ABC
2385
       *    DE
2386
       *    FGH
2387
       * will be considered for merging later. We can always do this "exactly" (if the blocks
2388
       * line up precisely), but to do this 'lossily', we guard it with 'FUZZY_VECTORS'.
2389
       */
2390
0
      prev = b->prev;
2391
0
      while (prev && prev->type == FZ_STEXT_BLOCK_VECTOR && (prev->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE))
2392
0
      {
2393
        /* Lossless merging. */
2394
0
        if (prev->bbox.x0 == b->bbox.x0 && prev->bbox.x1 == b->bbox.x1 && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1)
2395
0
        {
2396
          /* Stacks exactly vertically. Very rarely hit. */
2397
0
          prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0);
2398
0
          prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1);
2399
0
          return;
2400
0
        }
2401
0
        else if (prev->bbox.y0 == b->bbox.y0 && prev->bbox.y1 == b->bbox.y1 && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1)
2402
0
        {
2403
          /* Stacks horizontally.  Very rarely hit. */
2404
0
          prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0);
2405
0
          prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1);
2406
0
          return;
2407
0
        }
2408
0
        if (tdev->flags & FZ_STEXT_FUZZY_VECTORS)
2409
0
        {
2410
          /* Be more forgiving in how we merge vectors */
2411
          /* We need to be careful not to merge together differently oriented borders for table cells.
2412
           *        C
2413
           *        |
2414
           *        v
2415
           *     +-----+-----+
2416
           * A-> |     |     |
2417
           *     +-----+-----+
2418
           * B-> |     |     |
2419
           *     +-----+-----+
2420
           *
2421
           * It'd be fine to merge borders A and B together, because it still signifies the same
2422
           * edges. It would NOT be fine to merge A and C together, because we'd lose the sense
2423
           * of them being borders, and just have a blob that covered the cell.
2424
           * The fudge2 logic below should hopefully allow for this, as well as allowing us to
2425
           * match blocks like:
2426
           *    ABC
2427
           *   DE FG
2428
           *    HIJ
2429
           *   KL MN
2430
           *    OPQ
2431
           */
2432
0
          float fudge2 = 2;
2433
0
          if ((fabsf(prev->bbox.x0 - b->bbox.x0) <= fudge2 || fabsf(prev->bbox.x1 - b->bbox.x1) <= fudge2) && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1)
2434
0
          {
2435
            /* Stacks vertically. */
2436
0
            goto join;
2437
0
          }
2438
0
          else if ((fabsf(prev->bbox.y0 - b->bbox.y0) <= fudge2 || fabsf(prev->bbox.y1 - b->bbox.y1) <= fudge2) && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1)
2439
0
          {
2440
            /* Stacks horizontally. */
2441
0
  join:
2442
0
            prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0);
2443
0
            prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1);
2444
0
            prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0);
2445
0
            prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1);
2446
            /* Unlink b (so, fiddle with b->prev, which is not necessarily prev!) */
2447
0
            b->prev->next = NULL;
2448
0
            if (tdev->flags & FZ_STEXT_LAZY_VECTORS)
2449
0
              tdev->lazy_vectors_tail = b->prev;
2450
0
            else if (page->last_struct)
2451
0
              page->last_struct->last_block = b->prev;
2452
0
            else
2453
0
              page->last_block = b->prev;
2454
0
            break;
2455
0
          }
2456
0
        }
2457
        /* Now, allow for looking further back. */
2458
0
        prev = prev->prev;
2459
0
      }
2460
0
    }
2461
0
  }
2462
2463
0
  if (tdev->flags & FZ_STEXT_LAZY_VECTORS)
2464
0
    b = add_lazy_vector(ctx, page, tdev, id);
2465
0
  else
2466
0
    b = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_VECTOR, id);
2467
2468
0
  b->bbox = bbox;
2469
0
  b->u.v.flags = flags;
2470
0
  b->u.v.argb = argb;
2471
0
}
2472
2473
typedef struct
2474
{
2475
  fz_stext_device *dev;
2476
  fz_matrix ctm;
2477
  uint32_t argb;
2478
  uint32_t flags;
2479
  fz_stext_page *page;
2480
  fz_rect seg_bounds;
2481
  fz_rect leftovers;
2482
  fz_rect pending;
2483
  int count;
2484
  fz_point p[5];
2485
  int id;
2486
  float exp;
2487
} split_path_data;
2488
2489
static void
2490
maybe_rect(fz_context *ctx, split_path_data *sp)
2491
0
{
2492
0
  int rect = 0;
2493
0
  int i;
2494
0
  fz_rect leftovers;
2495
2496
0
  if (sp->count >= 0)
2497
0
  {
2498
0
    if (sp->count == 3)
2499
0
    {
2500
      /* Allow for "moveto A, lineto B, lineto A, close" */
2501
0
      if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y))
2502
0
        sp->count = 2;
2503
0
    }
2504
0
    if (sp->count == 2)
2505
0
    {
2506
0
      if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y))
2507
0
        rect = 1; /* Count that as a rect */
2508
0
    }
2509
0
    else if (sp->count == 4 || sp->count == 5)
2510
0
    {
2511
0
      if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y))
2512
0
        rect = 1;
2513
0
      else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y))
2514
0
        rect = 1;
2515
0
    }
2516
0
    if (rect)
2517
0
    {
2518
0
      fz_rect bounds;
2519
2520
0
      bounds.x0 = bounds.x1 = sp->p[0].x;
2521
0
      bounds.y0 = bounds.y1 = sp->p[0].y;
2522
0
      for (i = 1; i < sp->count; i++)
2523
0
        bounds = fz_include_point_in_rect(bounds, sp->p[i]);
2524
0
      if (fz_is_valid_rect(sp->pending))
2525
0
        add_vector(ctx, sp->page, sp->dev, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb, sp->id, sp->exp);
2526
0
      sp->pending = bounds;
2527
0
      return;
2528
0
    }
2529
0
  }
2530
2531
  /* We aren't a rectangle! */
2532
0
  leftovers = sp->seg_bounds;
2533
2534
0
  if (sp->dev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP))
2535
0
    leftovers = fz_intersect_rect(leftovers, current_clip(ctx, sp->dev));
2536
2537
0
  if (fz_is_valid_rect(leftovers))
2538
0
    sp->leftovers = fz_union_rect(sp->leftovers, leftovers);
2539
2540
  /* Remember we're not a rect. */
2541
0
  sp->count = -1;
2542
0
}
2543
2544
static void
2545
split_move(fz_context *ctx, void *arg, float x, float y)
2546
0
{
2547
0
  split_path_data *sp = (split_path_data *)arg;
2548
0
  fz_point p = fz_transform_point_xy(x, y, sp->ctm);
2549
2550
0
  maybe_rect(ctx, sp);
2551
0
  sp->p[0] = p;
2552
0
  sp->count = 1;
2553
0
  sp->seg_bounds.x0 = sp->seg_bounds.x1 = p.x;
2554
0
  sp->seg_bounds.y0 = sp->seg_bounds.y1 = p.y;
2555
0
}
2556
2557
static void
2558
split_line(fz_context *ctx, void *arg, float x, float y)
2559
0
{
2560
0
  split_path_data *sp = (split_path_data *)arg;
2561
0
  fz_point p = fz_transform_point_xy(x, y, sp->ctm);
2562
2563
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, p);
2564
2565
0
  if (sp->count >= 0)
2566
0
  {
2567
    /* Check for lines to the same point. */
2568
0
    if (feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y))
2569
0
      return;
2570
    /* If we're still maybe a rect, just record the point. */
2571
0
    if (sp->count < 4)
2572
0
    {
2573
0
      sp->p[sp->count++] = p;
2574
0
      return;
2575
0
    }
2576
    /* Check for close line? */
2577
0
    if (sp->count == 4)
2578
0
    {
2579
0
      if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y))
2580
0
      {
2581
        /* We've just drawn a line back to the start point. */
2582
        /* Needless saving of point, but it makes the logic
2583
         * easier elsewhere. */
2584
0
        sp->p[sp->count++] = p;
2585
0
        return;
2586
0
      }
2587
0
    }
2588
    /* We can no longer be a rect. */
2589
0
    sp->count = -1;
2590
0
  }
2591
0
}
2592
2593
static void
2594
split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
2595
0
{
2596
0
  split_path_data *sp = (split_path_data *)arg;
2597
2598
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x1, y1, sp->ctm));
2599
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x2, y2, sp->ctm));
2600
0
  sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x3, y3, sp->ctm));
2601
2602
  /* We can no longer be a rect. */
2603
0
  sp->count = -1;
2604
0
}
2605
2606
static void
2607
split_close(fz_context *ctx, void *arg)
2608
0
{
2609
0
  split_path_data *sp = (split_path_data *)arg;
2610
2611
0
  maybe_rect(ctx, sp);
2612
0
  sp->count = 0;
2613
0
}
2614
2615
2616
static const
2617
fz_path_walker split_path_rects =
2618
{
2619
  split_move,
2620
  split_line,
2621
  split_curve,
2622
  split_close
2623
};
2624
2625
static void
2626
add_vectors_from_path(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke, float exp)
2627
0
{
2628
0
  int have_leftovers;
2629
0
  split_path_data sp;
2630
0
  int id = tdev->id;
2631
2632
0
  sp.dev = tdev;
2633
0
  sp.ctm = ctm;
2634
0
  sp.argb = hexrgba_from_color(ctx, cs, color, alpha);
2635
0
  sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0;
2636
0
  sp.page = page;
2637
0
  sp.count = 0;
2638
0
  sp.leftovers = fz_empty_rect;
2639
0
  sp.seg_bounds = fz_empty_rect;
2640
0
  sp.pending = fz_empty_rect;
2641
0
  sp.id = id;
2642
0
  sp.exp = exp;
2643
0
  fz_walk_path(ctx, path, &split_path_rects, &sp);
2644
2645
0
  have_leftovers = fz_is_valid_rect(sp.leftovers);
2646
2647
0
  maybe_rect(ctx, &sp);
2648
2649
0
  if (fz_is_valid_rect(sp.pending))
2650
0
    add_vector(ctx, page, sp.dev, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb, id, exp);
2651
0
  if (have_leftovers)
2652
0
    add_vector(ctx, page, sp.dev, sp.leftovers, sp.flags, sp.argb, id, exp);
2653
0
}
2654
2655
static void
2656
fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2657
0
{
2658
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2659
0
  fz_stext_page *page = tdev->page;
2660
0
  fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
2661
0
  fz_rect *bounds = actualtext_bounds(tdev);
2662
2663
  /* If we're in an actualtext, then update the bounds to include this content. */
2664
0
  if (bounds != NULL)
2665
0
    *bounds = fz_union_rect(*bounds, path_bounds);
2666
2667
0
  if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2668
0
    check_for_strikeout(ctx, tdev, page, path, ctm);
2669
2670
0
  if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2671
0
    add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, 0, 0);
2672
0
}
2673
2674
static void
2675
fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2676
0
{
2677
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2678
0
  fz_stext_page *page = tdev->page;
2679
0
  fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm);
2680
0
  fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
2681
0
  float exp = ss->linewidth / 2;
2682
2683
  /* If we're in an actualtext, then update the bounds to include this content. */
2684
0
  if (bounds != NULL)
2685
0
    *bounds = fz_union_rect(*bounds, path_bounds);
2686
2687
0
  if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2688
0
    check_for_strikeout(ctx, tdev, page, path, ctm);
2689
2690
0
  if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2691
0
    add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, 1, exp);
2692
0
}
2693
2694
static void
2695
new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
2696
0
{
2697
0
  fz_stext_struct *str;
2698
0
  size_t z;
2699
2700
0
  if (raw == NULL)
2701
0
    raw = "";
2702
0
  z = strlen(raw);
2703
2704
0
  str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1);
2705
0
  str->first_block = NULL;
2706
0
  str->last_block = NULL;
2707
0
  str->standard = standard;
2708
0
  str->parent = page->last_struct;
2709
0
  str->up = block;
2710
0
  memcpy(str->raw, raw, z+1);
2711
2712
0
  block->u.s.down = str;
2713
0
}
2714
2715
fz_stext_block *
2716
fz_new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_structure standard, const char *raw, int idx)
2717
0
{
2718
0
  fz_stext_block *block;
2719
2720
0
  block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
2721
0
  block->bbox = fz_empty_rect;
2722
0
  block->prev = NULL;
2723
0
  block->next = NULL;
2724
0
  block->type = FZ_STEXT_BLOCK_STRUCT;
2725
0
  block->u.s.index = idx;
2726
0
  block->u.s.down = NULL;
2727
  /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
2728
0
  new_stext_struct(ctx, page, block, standard, raw);
2729
2730
0
  return block;
2731
0
}
2732
2733
2734
static void
2735
fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
2736
0
{
2737
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2738
0
  fz_stext_page *page = tdev->page;
2739
0
  fz_stext_block *block, *le, *gt, *newblock;
2740
2741
0
  if (raw == NULL)
2742
0
    raw = "";
2743
2744
  /* Find a pointer to the last block. */
2745
0
  if (page->last_block)
2746
0
  {
2747
0
    block = page->last_block;
2748
0
  }
2749
0
  else if (page->last_struct)
2750
0
  {
2751
0
    block = page->last_struct->last_block;
2752
0
  }
2753
0
  else
2754
0
  {
2755
0
    block = page->first_block;
2756
0
  }
2757
2758
  /* So block is somewhere in the content chain. Let's try and find:
2759
   *   le = the struct node <= idx before block in the content chain.
2760
   *   ge = the struct node >= idx after block in the content chain.
2761
   * Search backwards to start with.
2762
   */
2763
0
  gt = NULL;
2764
0
  le = block;
2765
0
  while (le)
2766
0
  {
2767
0
    if (le->type == FZ_STEXT_BLOCK_STRUCT)
2768
0
    {
2769
0
      if (le->u.s.index > idx)
2770
0
        gt = le;
2771
0
      if (le->u.s.index <= idx)
2772
0
        break;
2773
0
    }
2774
0
    le = le->prev;
2775
0
  }
2776
  /* The following loop copes with finding gt (the smallest block with an index higher
2777
   * than we want) if we haven't found it already. The while loop in here was designed
2778
   * to cope with 'block' being in the middle of a list. In fact, the way the code is
2779
   * currently, block will always be at the end of a list, so the while won't do anything.
2780
   * But I'm loathe to remove it in case we ever change this code to start from wherever
2781
   * we did the last insertion. */
2782
0
  if (gt == NULL)
2783
0
  {
2784
0
    gt = block;
2785
0
    while (gt)
2786
0
    {
2787
0
      if (gt->type == FZ_STEXT_BLOCK_STRUCT)
2788
0
      {
2789
0
        if (gt->u.s.index <= idx)
2790
0
          le = gt;
2791
0
        if (gt->u.s.index >= idx)
2792
0
          break;
2793
0
      }
2794
0
      block = gt;
2795
0
      gt = gt->next;
2796
0
    }
2797
0
  }
2798
2799
0
  if (le && le->u.s.index == idx)
2800
0
  {
2801
    /* We want to move down into the le block. Does it have a struct
2802
     * attached yet? */
2803
0
    if (le->u.s.down == NULL)
2804
0
    {
2805
      /* No. We need to create a new struct node. */
2806
0
      new_stext_struct(ctx, page, le, standard, raw);
2807
0
    }
2808
0
    else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0)
2809
0
    {
2810
      /* Yes, but it doesn't match the one we expect! */
2811
0
      fz_warn(ctx, "Mismatched structure type!");
2812
0
    }
2813
0
    page->last_struct = le->u.s.down;
2814
0
    page->last_block = le->u.s.down->last_block;
2815
2816
0
    return;
2817
0
  }
2818
2819
  /* We are going to need to create a new block. Create a complete unlinked one here. */
2820
0
  newblock = fz_new_stext_struct(ctx, page, standard, raw, idx);
2821
2822
  /* So now we just need to link it in somewhere. */
2823
0
  if (gt)
2824
0
  {
2825
    /* Link it in before gt. */
2826
0
    newblock->prev = gt->prev;
2827
0
    if (gt->prev)
2828
0
      gt->prev->next = newblock;
2829
0
    else if (page->last_struct)
2830
0
    {
2831
      /* We're linking it in at the start under another struct! */
2832
0
      assert(page->last_struct->first_block == gt);
2833
0
      assert(page->last_struct->last_block != NULL);
2834
0
      page->last_struct->first_block = newblock;
2835
0
    }
2836
0
    else
2837
0
    {
2838
      /* We're linking it in at the start of the page! */
2839
0
      assert(page->first_block == gt);
2840
0
      page->first_block = newblock;
2841
0
    }
2842
0
    gt->prev = newblock;
2843
0
    newblock->next = gt;
2844
0
    newblock->id = gt->id;
2845
0
  }
2846
0
  else if (block)
2847
0
  {
2848
    /* Link it in at the end of the list (i.e. after 'block') */
2849
0
    newblock->prev = block;
2850
0
    block->next = newblock;
2851
0
    if (page->last_struct)
2852
0
    {
2853
0
      assert(page->last_struct->last_block == block);
2854
0
      page->last_struct->last_block = newblock;
2855
0
    }
2856
0
    else
2857
0
    {
2858
0
      assert(page->last_block == block);
2859
0
      page->last_block = newblock;
2860
0
    }
2861
0
    newblock->id = block->id;
2862
0
  }
2863
0
  else if (page->last_struct)
2864
0
  {
2865
    /* We have no blocks at all at this level. */
2866
0
    page->last_struct->first_block = newblock;
2867
0
    page->last_struct->last_block = newblock;
2868
0
    newblock->id = page->last_struct->up->id;
2869
0
  }
2870
0
  else
2871
0
  {
2872
    /* We have no blocks at ANY level. */
2873
0
    page->first_block = newblock;
2874
    /* newblock will have an id of 0. Best we can do. */
2875
0
  }
2876
  /* Wherever we linked it in, that's where we want to continue adding content. */
2877
0
  page->last_struct = newblock->u.s.down;
2878
0
  page->last_block = NULL;
2879
0
}
2880
2881
static void
2882
fz_stext_end_structure(fz_context *ctx, fz_device *dev)
2883
0
{
2884
0
  fz_stext_device *tdev = (fz_stext_device*)dev;
2885
0
  fz_stext_page *page = tdev->page;
2886
0
  fz_stext_struct *str = page->last_struct;
2887
2888
0
  if (str == NULL)
2889
0
  {
2890
0
    fz_warn(ctx, "Structure out of sync");
2891
0
    return;
2892
0
  }
2893
2894
0
  page->last_struct = str->parent;
2895
0
  if (page->last_struct == NULL)
2896
0
  {
2897
0
    page->last_block = page->first_block;
2898
    /* Yuck */
2899
0
    while (page->last_block->next)
2900
0
      page->last_block = page->last_block->next;
2901
0
  }
2902
0
  else
2903
0
  {
2904
0
    page->last_block = page->last_struct->last_block;
2905
0
  }
2906
0
}
2907
2908
fz_device *
2909
fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
2910
0
{
2911
0
  return fz_new_stext_device_for_page(ctx, page, opts, 0, 0, fz_empty_rect);
2912
0
}
2913
2914
fz_device *
2915
fz_new_stext_device_for_page(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts, int chapter_num, int page_num, fz_rect mediabox)
2916
0
{
2917
0
  fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
2918
2919
0
  dev->super.close_device = fz_stext_close_device;
2920
0
  dev->super.drop_device = fz_stext_drop_device;
2921
2922
0
  dev->super.fill_text = fz_stext_fill_text;
2923
0
  dev->super.stroke_text = fz_stext_stroke_text;
2924
0
  dev->super.clip_text = fz_stext_clip_text;
2925
0
  dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
2926
0
  dev->super.ignore_text = fz_stext_ignore_text;
2927
0
  dev->super.begin_metatext = fz_stext_begin_metatext;
2928
0
  dev->super.end_metatext = fz_stext_end_metatext;
2929
2930
0
  dev->super.fill_shade = fz_stext_fill_shade;
2931
0
  dev->super.fill_image = fz_stext_fill_image;
2932
0
  dev->super.fill_image_mask = fz_stext_fill_image_mask;
2933
2934
0
  if (opts)
2935
0
  {
2936
0
    dev->flags = opts->flags;
2937
0
    if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE)
2938
0
    {
2939
0
      dev->super.begin_structure = fz_stext_begin_structure;
2940
0
      dev->super.end_structure = fz_stext_end_structure;
2941
0
    }
2942
0
    if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES))
2943
0
    {
2944
0
      dev->super.fill_path = fz_stext_fill_path;
2945
0
      dev->super.stroke_path = fz_stext_stroke_path;
2946
0
    }
2947
0
  }
2948
0
  dev->page = page;
2949
0
  dev->pen.x = 0;
2950
0
  dev->pen.y = 0;
2951
0
  dev->trm = fz_identity;
2952
0
  dev->lastchar = ' ';
2953
0
  dev->lastline = NULL;
2954
0
  dev->lasttext = NULL;
2955
0
  dev->lastbidi = 0;
2956
0
  dev->last_was_fake_bold = 1;
2957
0
  if (opts)
2958
0
    dev->opts = *opts;
2959
2960
  /* If we are ignoring images, then it'd be nice to skip the decode costs. BUT we still need them to tell
2961
   * us the bounds for ActualText, so we can only actually skip them if we are ignoring actualtext too. */
2962
0
  if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && (dev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0)
2963
0
    dev->super.hints |= FZ_DONT_DECODE_IMAGES;
2964
2965
0
  dev->rect_max = 0;
2966
0
  dev->rect_len = 0;
2967
0
  dev->rects = NULL;
2968
2969
  /* Push a new id */
2970
0
  fz_try(ctx)
2971
0
  {
2972
0
    fz_stext_page_details *deets;
2973
0
    size_t id;
2974
0
    deets = fz_pool_array_append(ctx, page->id_list, &id);
2975
0
    dev->id = (int)id;
2976
0
    deets->mediabox = mediabox;
2977
0
    deets->chapter = chapter_num;
2978
0
    deets->page = page_num;
2979
0
  }
2980
0
  fz_catch(ctx)
2981
0
  {
2982
0
    fz_free(ctx, dev);
2983
0
    fz_rethrow(ctx);
2984
0
  }
2985
2986
0
  page->mediabox = fz_union_rect(page->mediabox, mediabox);
2987
2988
0
  return (fz_device*)dev;
2989
0
}