Coverage Report

Created: 2026-06-08 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/fitz/stext-boxer.c
Line
Count
Source
1
// Copyright (C) 2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#undef DEBUG_WRITE_AS_PS
26
#undef DEBUG_STRUCT
27
28
typedef struct boxer_s boxer_t;
29
30
typedef struct {
31
  int len;
32
  int max;
33
  double fudge;
34
  fz_rect list[FZ_FLEXIBLE_ARRAY];
35
} rectlist_t;
36
37
struct boxer_s {
38
  fz_rect mediabox;
39
  rectlist_t *list;
40
  int tight;
41
};
42
43
static rectlist_t *
44
rectlist_create(fz_context *ctx, int max, double fudge)
45
0
{
46
0
  rectlist_t *list = fz_malloc_flexible(ctx, rectlist_t, list, max);
47
48
0
  list->len = 0;
49
0
  list->max = max;
50
0
  list->fudge = fudge;
51
52
0
  return list;
53
0
}
54
55
/* Push box onto rectlist, unless it is completely enclosed by
56
 * another box, or completely encloses others (in which case they
57
 * are replaced by it). */
58
static void
59
rectlist_append(rectlist_t *list, fz_rect *box)
60
0
{
61
0
  int i;
62
  /* We allow ourselves a fudge factor when checking for inclusion.
63
   * This is either 4 points, or 0 points, depending on whether
64
   * we are running in 'tight' mode or not. */
65
0
  double r_fudge = list->fudge;
66
67
0
  for (i = 0; i < list->len; i++)
68
0
  {
69
0
    fz_rect *r = &list->list[i];
70
0
    fz_rect smaller, larger;
71
72
0
    smaller.x0 = r->x0 + r_fudge;
73
0
    larger. x0 = r->x0 - r_fudge;
74
0
    smaller.y0 = r->y0 + r_fudge;
75
0
    larger. y0 = r->y0 - r_fudge;
76
0
    smaller.x1 = r->x1 - r_fudge;
77
0
    larger. x1 = r->x1 + r_fudge;
78
0
    smaller.y1 = r->y1 - r_fudge;
79
0
    larger. y1 = r->y1 + r_fudge;
80
81
0
    if (fz_contains_rect(larger, *box))
82
0
      return; /* box is enclosed! Nothing to do. */
83
0
    if (fz_contains_rect(*box, smaller))
84
0
    {
85
      /* box encloses r. Ditch r. */
86
      /* Shorten the list */
87
0
      --list->len;
88
      /* If the one that just got chopped off wasn't r, move it down. */
89
0
      if (i < list->len)
90
0
      {
91
0
        memmove(r, &list->list[list->len], sizeof(*r));
92
0
        i--; /* Reconsider this entry next time. */
93
0
      }
94
0
    }
95
0
  }
96
97
0
  assert(list->len < list->max);
98
0
  memcpy(&list->list[list->len], box, sizeof(*box));
99
0
  list->len++;
100
0
}
101
102
static boxer_t *
103
boxer_create_length(fz_context *ctx, fz_rect *mediabox, int len, int tight)
104
0
{
105
0
  boxer_t *boxer = fz_malloc_struct(ctx, boxer_t);
106
107
0
  if (boxer == NULL)
108
0
    return NULL;
109
110
0
  memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox));
111
0
  boxer->list = rectlist_create(ctx, len, tight ? 0 : 4);
112
0
  boxer->tight = tight;
113
114
0
  return boxer;
115
0
}
116
117
/* Create a boxer structure for a page of size mediabox. */
118
static boxer_t *
119
boxer_create(fz_context *ctx, fz_rect *mediabox, int tight)
120
0
{
121
0
  boxer_t *boxer = boxer_create_length(ctx, mediabox, 1, tight);
122
123
0
  if (boxer == NULL)
124
0
    return NULL;
125
0
  rectlist_append(boxer->list, mediabox);
126
127
0
  return boxer;
128
0
}
129
130
static void
131
push_if_intersect_suitable(rectlist_t *dst, const fz_rect *a, const fz_rect *b)
132
0
{
133
0
  fz_rect c;
134
135
  /* Intersect a and b. */
136
0
  c = fz_intersect_rect(*a, *b);
137
  /* If no intersection, nothing to push. */
138
0
  if (!fz_is_valid_rect(c))
139
0
    return;
140
141
0
  rectlist_append(dst, &c);
142
0
}
143
144
static void
145
boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const fz_rect *box)
146
0
{
147
0
  int i;
148
149
0
  for (i = 0; i < src->len; i++)
150
0
    push_if_intersect_suitable(dst, &src->list[i], box);
151
0
}
152
153
/* Mark a given box as being occupied (typically by a glyph) */
154
static void boxer_feed(fz_context *ctx, boxer_t *boxer, fz_rect *bbox)
155
0
{
156
0
  fz_rect box;
157
  /* When we feed a box into a the boxer, we can never make
158
  * the list more than 4 times as long. */
159
0
  rectlist_t *newlist = rectlist_create(ctx, boxer->list->len * 4, boxer->list->fudge);
160
161
#ifdef DEBUG_WRITE_AS_PS
162
  printf("0 0 1 setrgbcolor\n");
163
  printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n",
164
    bbox->x0, bbox->y0,
165
    bbox->x0, bbox->y1,
166
    bbox->x1, bbox->y1,
167
    bbox->x1, bbox->y0
168
  );
169
#endif
170
171
  /* Left (0,0) (x0,H) */
172
0
  box.x0 = boxer->mediabox.x0;
173
0
  box.y0 = boxer->mediabox.y0;
174
0
  box.x1 = bbox->x0;
175
0
  box.y1 = boxer->mediabox.y1;
176
0
  boxlist_feed_intersect(newlist, boxer->list, &box);
177
178
  /* Right (x1,0) (W,H) */
179
0
  box.x0 = bbox->x1;
180
0
  box.y0 = boxer->mediabox.y0;
181
0
  box.x1 = boxer->mediabox.x1;
182
0
  box.y1 = boxer->mediabox.y1;
183
0
  boxlist_feed_intersect(newlist, boxer->list, &box);
184
185
  /* Bottom (0,0) (W,y0) */
186
0
  box.x0 = boxer->mediabox.x0;
187
0
  box.y0 = boxer->mediabox.y0;
188
0
  box.x1 = boxer->mediabox.x1;
189
0
  box.y1 = bbox->y0;
190
0
  boxlist_feed_intersect(newlist, boxer->list, &box);
191
192
  /* Top (0,y1) (W,H) */
193
0
  box.x0 = boxer->mediabox.x0;
194
0
  box.y0 = bbox->y1;
195
0
  box.x1 = boxer->mediabox.x1;
196
0
  box.y1 = boxer->mediabox.y1;
197
0
  boxlist_feed_intersect(newlist, boxer->list, &box);
198
199
0
  fz_free(ctx, boxer->list);
200
0
  boxer->list = newlist;
201
0
}
202
203
#ifdef DEBUG_WRITE_AS_PS
204
static int
205
compare_areas(const void *a_, const void *b_)
206
{
207
  const fz_rect *a = (const fz_rect *)a_;
208
  const fz_rect *b = (const fz_rect *)b_;
209
  double area_a = (a->x1-a->x0) * (a->y1-a->y0);
210
  double area_b = (b->x1-b->x0) * (b->y1-b->y0);
211
212
  if (area_a < area_b)
213
    return 1;
214
  else if (area_a > area_b)
215
    return -1;
216
  else
217
    return 0;
218
}
219
220
/* Sort the rectangle list to be largest area first. For ease of humans
221
 * reading debug output. */
222
static void boxer_sort(boxer_t *boxer)
223
{
224
  qsort(boxer->list->list, boxer->list->len, sizeof(fz_rect), compare_areas);
225
}
226
227
/* Get the rectangle list for a given boxer. Return value is the length of
228
 * the list. Lifespan is until the boxer is modified or freed. */
229
static int boxer_results(boxer_t *boxer, fz_rect **list)
230
{
231
  *list = boxer->list->list;
232
  return boxer->list->len;
233
}
234
#endif
235
236
/* Currently unused debugging routine */
237
#if 0
238
static void
239
boxer_dump(fz_context *ctx, boxer_t *boxer)
240
{
241
  int i;
242
243
  printf("bbox = %g %g %g %g\n", boxer->mediabox.x0, boxer->mediabox.y0, boxer->mediabox.x1, boxer->mediabox.y1);
244
  for (i = 0; i < boxer->list->len; i++)
245
  {
246
    printf("%d %g %g %g %g\n", i, boxer->list->list[i].x0, boxer->list->list[i].y0, boxer->list->list[i].x1, boxer->list->list[i].y1);
247
  }
248
}
249
#endif
250
251
/* Destroy a boxer. */
252
static void boxer_destroy(fz_context *ctx, boxer_t *boxer)
253
0
{
254
0
  if (!boxer)
255
0
    return;
256
257
0
  fz_free(ctx, boxer->list);
258
0
  fz_free(ctx, boxer);
259
0
}
260
261
/* Find the margins for a given boxer. */
262
static fz_rect boxer_margins(boxer_t *boxer)
263
0
{
264
0
  rectlist_t *list = boxer->list;
265
0
  int i;
266
0
  fz_rect margins = boxer->mediabox;
267
268
0
  for (i = 0; i < list->len; i++)
269
0
  {
270
0
    fz_rect *r = &list->list[i];
271
0
    if (r->x0 <= margins.x0 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
272
0
      margins.x0 = r->x1; /* Left Margin */
273
0
    else if (r->x1 >= margins.x1 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
274
0
      margins.x1 = r->x0; /* Right Margin */
275
0
    else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y0 <= margins.y0)
276
0
      margins.y0 = r->y1; /* Top Margin */
277
0
    else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y1 >= margins.y1)
278
0
      margins.y1 = r->y0; /* Bottom Margin */
279
0
  }
280
281
0
  return margins;
282
0
}
283
284
/* Create a new boxer from a subset of an old one. */
285
static boxer_t *boxer_subset(fz_context *ctx, boxer_t *boxer, fz_rect rect)
286
0
{
287
0
  boxer_t *new_boxer = boxer_create_length(ctx, &rect, boxer->list->len, boxer->tight);
288
0
  int i;
289
290
0
  if (new_boxer == NULL)
291
0
    return NULL;
292
293
0
  for (i = 0; i < boxer->list->len; i++)
294
0
  {
295
0
    fz_rect r = fz_intersect_rect(boxer->list->list[i], rect);
296
297
0
    if (fz_is_empty_rect(r))
298
0
      continue;
299
0
    rectlist_append(new_boxer->list, &r);
300
0
  }
301
302
0
  return new_boxer;
303
0
}
304
305
static int analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *big_boxer, int depth);
306
307
static void
308
analyse_subset(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *boxer, fz_rect r, int depth)
309
0
{
310
0
  boxer_t *sub_box = boxer_subset(ctx, boxer, r);
311
312
0
  fz_try(ctx)
313
0
    (void)analyse_sub(ctx, page, parent, sub_box, depth);
314
0
  fz_always(ctx)
315
0
    boxer_destroy(ctx, sub_box);
316
0
  fz_catch(ctx)
317
0
    fz_rethrow(ctx);
318
0
}
319
320
/* Consider a boxer for subdivision.
321
 * Returns 0 if no suitable subdivision point found.
322
 * Returns 1 if a subdivision point is found.*/
323
static int
324
boxer_subdivide(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *boxer, int depth)
325
0
{
326
0
  rectlist_t *list = boxer->list;
327
0
  double max_size = 0;
328
0
  int i;
329
0
  int horiz = 0;
330
0
  int largest = -1;
331
0
  fz_rect r;
332
333
0
  for (i = 0; i < list->len; i++)
334
0
  {
335
0
    r = list->list[i];
336
337
0
    if (r.x0 <= boxer->mediabox.x0 && r.x1 >= boxer->mediabox.x1)
338
0
    {
339
      /* Horizontal divider */
340
0
      double size = r.y1 - r.y0;
341
0
      if (size > max_size)
342
0
      {
343
0
        max_size = size;
344
0
        largest = i;
345
0
        horiz = 1;
346
0
      }
347
0
    }
348
0
    if (r.y0 <= boxer->mediabox.y0 && r.y1 >= boxer->mediabox.y1)
349
0
    {
350
      /* Vertical divider */
351
0
      double size = r.x1 - r.x0;
352
0
      if (size > max_size)
353
0
      {
354
0
        max_size = size;
355
0
        largest = i;
356
0
        horiz = 0;
357
0
      }
358
0
    }
359
0
  }
360
361
0
  if (largest == -1)
362
0
  {
363
#ifdef DEBUG_WRITE_AS_PS
364
    {
365
      printf("%% SUBSET\n");
366
      printf("0 1 1 setrgbcolor\n");
367
      printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
368
      boxer->mediabox.x0, boxer->mediabox.y0,
369
      boxer->mediabox.x0, boxer->mediabox.y1,
370
      boxer->mediabox.x1, boxer->mediabox.y1,
371
      boxer->mediabox.x1, boxer->mediabox.y0);
372
    }
373
#endif
374
0
    return 0;
375
0
  }
376
377
0
  r = boxer->mediabox;
378
0
  if (horiz)
379
0
  {
380
    /* Divider runs horizontally. */
381
#ifdef DEBUG_WRITE_AS_PS
382
    {
383
      printf("%% H DIVIDER\n");
384
      printf("1 0 1 setrgbcolor\n");
385
      printf("%g %g moveto\n%g %g lineto\nstroke\n\n",
386
      list->list[largest].x0, (list->list[largest].y0 + list->list[largest].y1) * 0.5f,
387
      list->list[largest].x1, (list->list[largest].y0 + list->list[largest].y1) * 0.5f);
388
    }
389
#endif
390
0
    r.y1 = list->list[largest].y0;
391
0
    analyse_subset(ctx, page, parent, boxer, r, depth);
392
393
0
    r.y0 = list->list[largest].y1;
394
0
    r.y1 = boxer->mediabox.y1;
395
0
    analyse_subset(ctx, page, parent, boxer, r, depth);
396
0
  }
397
0
  else
398
0
  {
399
    /* Divider runs vertically. */
400
#ifdef DEBUG_WRITE_AS_PS
401
    {
402
      printf("%% V DIVIDER\n");
403
      printf("1 0 1 setrgbcolor\n");
404
      printf("%g %g moveto\n%g %g lineto\nstroke\n\n",
405
      (list->list[largest].x0 + list->list[largest].x1) * 0.5f, list->list[largest].y0,
406
      (list->list[largest].x0 + list->list[largest].x1) * 0.5f, list->list[largest].y1);
407
    }
408
#endif
409
0
    r.x1 = list->list[largest].x0;
410
0
    analyse_subset(ctx, page, parent, boxer, r, depth);
411
412
0
    r.x0 = list->list[largest].x1;
413
0
    r.x1 = boxer->mediabox.x1;
414
0
    analyse_subset(ctx, page, parent, boxer, r, depth);
415
0
  }
416
417
0
  return 1;
418
0
}
419
420
#ifdef DEBUG_STRUCT
421
static void
422
do_dump_stext(fz_stext_block *block, int depth)
423
{
424
  int d;
425
  int idx = -1;
426
427
  while (block)
428
  {
429
    for (d = 0; d < depth; d++)
430
      printf(" ");
431
    switch(block->type)
432
    {
433
      case FZ_STEXT_BLOCK_TEXT:
434
        printf("TEXT %p\n", block);
435
        break;
436
      case FZ_STEXT_BLOCK_IMAGE:
437
        printf("IMAGE %p\n", block);
438
        break;
439
      case FZ_STEXT_BLOCK_VECTOR:
440
        printf("VECTOR %p\n", block);
441
        break;
442
      case FZ_STEXT_BLOCK_STRUCT:
443
        printf("STRUCT %p (idx=%d)\n", block, block->u.s.index);
444
        assert(block->u.s.index > idx);
445
        idx = block->u.s.index;
446
        do_dump_stext(block->u.s.down->first_block, depth+1);
447
        break;
448
    }
449
    block = block->next;
450
  }
451
}
452
453
static void
454
dump_stext(char *str, fz_stext_block *block)
455
{
456
  printf("%s\n", str);
457
458
  do_dump_stext(block, 0);
459
}
460
#endif
461
462
static void
463
recalc_bbox(fz_stext_block *block)
464
0
{
465
0
  fz_rect bbox = fz_empty_rect;
466
0
  fz_stext_line *line;
467
468
0
  for (line = block->u.t.first_line; line != NULL; line = line->next)
469
0
    bbox = fz_union_rect(bbox, line->bbox);
470
471
0
  block->bbox = bbox;
472
0
}
473
474
static fz_stext_struct *
475
page_subset(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, fz_rect mediabox)
476
0
{
477
0
  fz_stext_block *block, *next_block;
478
0
  fz_stext_block *target = NULL; /* The first block in our target list */
479
0
  fz_stext_block *last = NULL; /* The last block in our target list */
480
0
  fz_stext_struct *target_parent = NULL;
481
0
  fz_stext_block *after = NULL; /* The block we want to insert after (NULL=start of list) */
482
0
  fz_stext_block *newblock;
483
0
  int idx = 0;
484
0
  int idx2;
485
#ifdef DEBUG_STRUCT
486
  dump_stext("BEFORE", parent ? parent->first_block : page->first_block);
487
#endif
488
489
0
  block = parent ? parent->first_block : page->first_block;
490
0
  while (block != NULL)
491
0
  {
492
0
    fz_rect bbox;
493
494
0
    next_block = block->next;
495
496
0
    bbox = block->bbox;
497
498
    /* Can we take the whole block? */
499
0
    if (bbox.x0 >= mediabox.x0 && bbox.y0 >= mediabox.y0 && bbox.x1 <= mediabox.x1 && bbox.y1 <= mediabox.y1)
500
0
    {
501
      /* Unlink block from the current list. */
502
0
      if (block->prev)
503
0
        block->prev->next = next_block;
504
0
      else if (parent)
505
0
        parent->first_block = next_block;
506
0
      else
507
0
        page->first_block = next_block;
508
0
      if (next_block)
509
0
        next_block->prev = block->prev;
510
0
      else if (parent)
511
0
        parent->last_block = block->prev;
512
0
      else
513
0
        page->last_block = block->prev;
514
515
      /* Add block onto our target list */
516
0
      if (target == NULL)
517
0
      {
518
0
        target = block;
519
0
        target_parent = parent;
520
0
        after = block->prev;
521
0
        block->prev = NULL;
522
0
      }
523
0
      else
524
0
      {
525
0
        last->next = block;
526
0
        block->prev = last;
527
0
      }
528
0
      last = block;
529
0
      block->next = NULL;
530
0
    }
531
0
    else if (fz_is_empty_rect(fz_intersect_rect(bbox, mediabox)))
532
0
    {
533
0
    }
534
0
    else if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
535
0
    {
536
0
      parent = block->u.s.down;
537
0
      next_block = parent->first_block;
538
0
    }
539
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
540
0
    {
541
      /* Need to look at the parts. */
542
0
      fz_stext_line *line, *next_line;
543
544
0
      newblock = NULL;
545
0
      for (line = block->u.t.first_line; line != NULL; line = next_line)
546
0
      {
547
0
        next_line = line->next;
548
0
        if (line->bbox.x0 >= mediabox.x0 && line->bbox.y0 >= mediabox.y0 && line->bbox.x1 <= mediabox.x1 && line->bbox.y1 <= mediabox.y1)
549
0
        {
550
          /* We need to take this line */
551
0
          if (newblock == NULL)
552
0
          {
553
0
            newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
554
555
            /* Add the block onto our target list */
556
0
            if (target == NULL)
557
0
            {
558
0
              target = newblock;
559
0
              target_parent = parent;
560
0
              if (line == block->u.t.first_line)
561
0
                after = block->prev;
562
0
              else
563
0
                after = block;
564
0
            }
565
0
            else
566
0
            {
567
0
              last->next = newblock;
568
0
              newblock->prev = last;
569
0
            }
570
0
            last = newblock;
571
0
            newblock->id = block->id;
572
0
          }
573
574
          /* Unlink line from the current list. */
575
0
          if (line->prev)
576
0
            line->prev->next = next_line;
577
0
          else
578
0
            block->u.t.first_line = next_line;
579
0
          if (next_line)
580
0
            next_line->prev = line->prev;
581
0
          else
582
0
            block->u.t.last_line = line->prev;
583
584
          /* Add line onto our new block */
585
0
          if (newblock->u.t.last_line == NULL)
586
0
          {
587
0
            newblock->u.t.first_line = newblock->u.t.last_line = line;
588
0
            line->prev = NULL;
589
0
          }
590
0
          else
591
0
          {
592
0
            line->prev = newblock->u.t.last_line;
593
0
            newblock->u.t.last_line->next = line;
594
0
            newblock->u.t.last_line = line;
595
0
          }
596
0
          line->next = NULL;
597
0
        }
598
0
      }
599
0
      if (newblock)
600
0
      {
601
0
        recalc_bbox(block);
602
0
        recalc_bbox(newblock);
603
0
      }
604
0
    }
605
606
    /* Step onwards (or upwards) */
607
0
    block = next_block;
608
0
    while (block == NULL)
609
0
    {
610
0
      if (parent == NULL)
611
0
        break;
612
0
      block = parent->up->next;
613
0
      parent = parent->parent;
614
0
    }
615
0
  }
616
617
  /* If no content to add, bale! */
618
0
  if (target == NULL)
619
0
    return NULL;
620
621
  /* We want to insert a structure node that contains target after 'after'. */
622
0
  block = target_parent ? target_parent->first_block : page->first_block;
623
0
  if (after != NULL)
624
0
  {
625
0
    while (1)
626
0
    {
627
0
      if (block->type == FZ_STEXT_BLOCK_STRUCT)
628
0
        idx = block->u.s.index+1;
629
0
      if (block == after)
630
0
        break;
631
0
      block = block->next;
632
0
    }
633
0
    block = block->next;
634
0
  }
635
  /* So we want to insert a structure node with index 'idx' after 'after' */
636
  /* Ensure that the following structure nodes have sane index values */
637
0
  idx2 = idx+1;
638
0
  for (; block != NULL; block = block->next)
639
0
  {
640
0
    if (block->type != FZ_STEXT_BLOCK_STRUCT)
641
0
      continue;
642
0
    if (block->u.s.index >= idx2)
643
0
      break;
644
0
    block->u.s.index = idx2;
645
0
    idx2++;
646
0
  }
647
648
  /* Convert from 'after' to 'before'. */
649
0
  if (after)
650
0
    block = after->next;
651
0
  else if (target_parent)
652
0
    block = target_parent->first_block;
653
0
  else
654
0
    block = page->first_block;
655
656
  /* So we want to insert just before block, with index 'idx'. */
657
658
  /* We are going to need to create a new block. Create a complete unlinked one here. */
659
0
  newblock = fz_new_stext_struct(ctx, page, FZ_STRUCTURE_DIV, "Split", idx);
660
0
  if (block)
661
0
    newblock->prev = block->prev;
662
0
  else if (target_parent)
663
0
    newblock->prev = target_parent->last_block;
664
0
  else
665
0
    newblock->prev = page->last_block;
666
0
  newblock->next = block;
667
0
  newblock->id = target->id;
668
669
  /* Now insert newblock just before block */
670
  /* If block was first, now we are. */
671
0
  if (target_parent)
672
0
  {
673
0
    if (target_parent->first_block == block)
674
0
      target_parent->first_block = newblock;
675
0
  }
676
0
  else if (page->first_block == block)
677
0
    page->first_block = newblock;
678
0
  if (block == NULL)
679
0
  {
680
    /* Inserting at the end! */
681
0
    if (target_parent)
682
0
    {
683
0
      if (target_parent->last_block)
684
0
        target_parent->last_block->next = newblock;
685
0
      target_parent->last_block = newblock;
686
0
    }
687
0
    else
688
0
    {
689
0
      if (page->last_block)
690
0
        page->last_block->next = newblock;
691
0
      page->last_block = newblock;
692
0
    }
693
0
  }
694
0
  else
695
0
  {
696
0
    if (block->prev)
697
0
      block->prev->next = newblock;
698
0
    block->prev = newblock;
699
0
  }
700
701
0
  newblock->u.s.down->first_block = target;
702
0
  newblock->u.s.down->last_block = last;
703
0
  newblock->u.s.down->parent = target_parent;
704
0
  target->prev = NULL;
705
706
0
  for (block = target; block->next != NULL; block = block->next)
707
0
  {
708
0
    newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
709
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
710
0
      block->u.s.down->parent = newblock->u.s.down;
711
0
  }
712
713
0
  newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
714
0
  newblock->u.s.down->last_block = block;
715
716
#ifdef DEBUG_STRUCT
717
  dump_stext("AFTER", parent ? parent->first_block : page->first_block);
718
#endif
719
720
0
  return newblock->u.s.down;
721
0
}
722
723
enum {
724
  MAX_ANALYSIS_DEPTH = 6
725
};
726
727
static int
728
analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *big_boxer, int depth)
729
0
{
730
0
  fz_rect margins;
731
0
  boxer_t *boxer;
732
0
  fz_stext_struct *div;
733
0
  int ret = 0;
734
735
  /* Find the margins in the enclosing boxer. This returns
736
   * a subset of the bbox of the original. */
737
0
  margins = boxer_margins(big_boxer);
738
#ifdef DEBUG_WRITE_AS_PS
739
  printf("\n\n%% MARGINS %g %g %g %g\n", margins.x0, margins.y0, margins.x1, margins.y1);
740
#endif
741
742
  /* Now subset the rectangles just to include those that are in our bbox. */
743
0
  boxer = boxer_subset(ctx, big_boxer, margins);
744
745
0
  fz_try(ctx)
746
0
  {
747
0
    div = page_subset(ctx, page, parent, boxer->mediabox);
748
    /* If nothing subsetted (no textual content in that region), give up. */
749
0
    if (div == NULL)
750
0
      break;
751
752
0
    ret = 1;
753
754
0
    if (depth < MAX_ANALYSIS_DEPTH)
755
0
    {
756
      /* Can we subdivide that region any more? */
757
0
      if (boxer_subdivide(ctx, page, div, boxer, depth+1))
758
0
        break;
759
0
    }
760
0
  }
761
0
  fz_always(ctx)
762
0
  {
763
0
    boxer_destroy(ctx, boxer);
764
0
  }
765
0
  fz_catch(ctx)
766
0
    fz_rethrow(ctx);
767
768
0
  return ret;
769
0
}
770
771
static int
772
line_isnt_all_spaces(fz_context *ctx, fz_stext_line *line)
773
0
{
774
0
  fz_stext_char *ch;
775
0
  for (ch = line->first_char; ch != NULL; ch = ch->next)
776
0
    if (ch->c != 32 && ch->c != 160)
777
0
      return 1;
778
0
  return 0;
779
0
}
780
781
static void
782
feed_line(fz_context *ctx, boxer_t *boxer, fz_stext_line *line)
783
0
{
784
0
  fz_stext_char *ch;
785
786
0
  for (ch = line->first_char; ch != NULL; ch = ch->next)
787
0
  {
788
0
    fz_rect r = fz_empty_rect;
789
790
0
    if (ch->c == ' ')
791
0
      continue;
792
793
0
    do
794
0
    {
795
0
      fz_rect bbox = fz_rect_from_quad(ch->quad);
796
0
      float margin = boxer->tight ? 0 : ch->size/4;
797
0
      bbox.x0 -= margin;
798
0
      bbox.y0 -= margin;
799
0
      bbox.x1 += margin;
800
0
      bbox.y1 += margin;
801
0
      r = fz_union_rect(r, bbox);
802
0
      ch = ch->next;
803
0
    }
804
0
    while (ch != NULL && ch->c != ' ');
805
0
    boxer_feed(ctx, boxer, &r);
806
0
    if (ch == NULL)
807
0
      break;
808
0
  }
809
0
}
810
811
/* Internal, non-API function, shared with stext-table. */
812
fz_rect
813
fz_collate_small_vector_run(fz_stext_block **blockp)
814
0
{
815
0
  fz_stext_block *block = *blockp;
816
0
  fz_stext_block *next;
817
0
  fz_rect r = block->bbox;
818
0
  int MAX_SIZE = 2;
819
0
  int MAX_GAP = 2;
820
821
0
  float w = r.x1 - r.x0;
822
0
  float h = r.y1 - r.y0;
823
824
0
  if (w < MAX_SIZE)
825
0
  {
826
0
    while ((next = block->next) != NULL &&
827
0
      next->type == FZ_STEXT_BLOCK_VECTOR &&
828
0
      next->bbox.x0 == r.x0 &&
829
0
      next->bbox.x1 == r.x1 &&
830
0
      ((next->bbox.y1 > r.y1 && next->bbox.y0 <= r.y1 + MAX_GAP) ||
831
0
      (next->bbox.y0 < r.y0 && next->bbox.y1 >= r.y0 - MAX_GAP)))
832
0
    {
833
0
      r = fz_union_rect(r, next->bbox);
834
0
      block = next;
835
0
    }
836
0
  }
837
0
  if (h < MAX_SIZE)
838
0
  {
839
0
    while ((next = block->next) != NULL &&
840
0
      next->type == FZ_STEXT_BLOCK_VECTOR &&
841
0
      next->bbox.y0 == r.y0 &&
842
0
      next->bbox.y1 == r.y1 &&
843
0
      ((next->bbox.x1 > r.x1 && next->bbox.x0 <= r.x1 + MAX_GAP) ||
844
0
      (next->bbox.x0 < r.x0 && next->bbox.x1 >= r.x0 - MAX_GAP)))
845
0
    {
846
0
      r = fz_union_rect(r, next->bbox);
847
0
      block = next;
848
0
    }
849
0
  }
850
851
0
  *blockp = block;
852
853
0
  return r;
854
0
}
855
856
static void
857
recurse_and_feed(fz_context *ctx, boxer_t *boxer, fz_stext_block *block)
858
0
{
859
0
  for (; block != NULL; block = block->next)
860
0
  {
861
0
    fz_stext_line *line;
862
0
    switch (block->type)
863
0
    {
864
0
    case FZ_STEXT_BLOCK_TEXT:
865
0
      for (line = block->u.t.first_line; line != NULL; line = line->next)
866
0
        if (line_isnt_all_spaces(ctx, line))
867
0
          feed_line(ctx, boxer, line);
868
0
      break;
869
0
    case FZ_STEXT_BLOCK_VECTOR:
870
0
    {
871
      /* Allow a 1 point margin around vectors to avoid hairline
872
       * cracks between supposedly abutting things. */
873
0
      int VECTOR_MARGIN = 1;
874
0
      fz_rect r = fz_collate_small_vector_run(&block);
875
876
0
      r.x0 -= VECTOR_MARGIN;
877
0
      r.y0 -= VECTOR_MARGIN;
878
0
      r.x1 += VECTOR_MARGIN;
879
0
      r.y1 += VECTOR_MARGIN;
880
0
      boxer_feed(ctx, boxer, &r);
881
0
      break;
882
0
    }
883
0
    case FZ_STEXT_BLOCK_STRUCT:
884
0
      if(block->u.s.down)
885
0
        recurse_and_feed(ctx, boxer, block->u.s.down->first_block);
886
0
      break;
887
0
    default:
888
0
      boxer_feed(ctx, boxer, &block->bbox);
889
0
      break;
890
0
    }
891
0
  }
892
0
}
893
894
static int
895
segment_rect(fz_context *ctx, fz_rect box, fz_stext_page *page, fz_stext_struct *parent, int tight)
896
0
{
897
0
  boxer_t *boxer;
898
0
  int ret = 0;
899
900
0
  boxer = boxer_create(ctx, &box, tight);
901
902
0
  fz_try(ctx)
903
0
  {
904
0
    recurse_and_feed(ctx, boxer, parent ? parent->first_block : page->first_block);
905
906
0
    ret = analyse_sub(ctx, page, parent, boxer, 0);
907
0
  }
908
0
  fz_always(ctx)
909
0
    boxer_destroy(ctx, boxer);
910
0
  fz_catch(ctx)
911
0
    fz_rethrow(ctx);
912
913
#ifdef DEBUG_WRITE_AS_PS
914
  printf("showpage\n");
915
#endif
916
917
0
  return ret;
918
0
}
919
920
int fz_segment_stext_rect(fz_context *ctx, fz_stext_page *page, fz_rect rect)
921
0
{
922
#ifdef DEBUG_WRITE_AS_PS
923
  printf("1 -1 scale 0 -%g translate\n", rect.y1-rect.y0);
924
#endif
925
926
0
  return segment_rect(ctx, rect, page, NULL, 1);
927
0
}
928
929
int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page)
930
0
{
931
0
  fz_stext_block *block;
932
933
0
  fz_stext_remove_page_fill(ctx, page);
934
935
  /* If we have structure already, give up. We can't hope to beat
936
   * proper structure! */
937
0
  for (block = page->first_block; block != NULL; block = block->next)
938
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
939
0
      return 0;
940
941
#ifdef DEBUG_WRITE_AS_PS
942
  printf("1 -1 scale 0 -%g translate\n", page->mediabox.y1-page->mediabox.y0);
943
#endif
944
945
0
  return segment_rect(ctx, page->mediabox, page, NULL, 0);
946
0
}
947
948
int
949
fz_stext_remove_page_fill(fz_context *ctx, fz_stext_page *page)
950
0
{
951
0
  fz_stext_page_block_iterator iter;
952
0
  int dropped = 0;
953
0
  fz_rect coverage = fz_empty_rect;
954
955
  /* First, find the area actually covered on the page. */
956
0
  for (iter = fz_stext_page_block_iterator_begin_dfs(page); !fz_stext_page_block_iterator_eod_dfs(iter); iter = fz_stext_page_block_iterator_next_dfs(iter))
957
0
  {
958
    /* Try to ignore stuff that's completely off screen */
959
0
    fz_rect bbox = fz_intersect_rect(page->mediabox, iter.block->bbox);
960
961
0
    coverage = fz_union_rect(coverage, bbox);
962
0
  }
963
964
  /* Iterate across all the blocks in the page in a depth first order. We'll break out
965
   * when we find the first one that is not a white (or transparent) rectangle fill
966
   * that covers a significant amount of the page. This therefore copes with several
967
   * page fills on the same page. */
968
0
  for (iter = fz_stext_page_block_iterator_begin_dfs(page); !fz_stext_page_block_iterator_eod_dfs(iter); iter = fz_stext_page_block_iterator_next_dfs(iter))
969
0
  {
970
0
    fz_rect bbox;
971
972
    /* Stop searching when we find something that's not a vector */
973
0
    if (iter.block->type != FZ_STEXT_BLOCK_VECTOR)
974
0
      break;
975
976
    /* Stop searching when we find a vector that's not a rectangle */
977
0
    if ((iter.block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0)
978
0
      break;
979
980
    /* Stop searching when we find a vector that's stroked */
981
0
    if ((iter.block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED) != 0)
982
0
      break;
983
984
    /* Stop searching when we find a vector that's not white (or invisible) */
985
0
    if ((iter.block->u.v.argb & 0xff000000) != 0 && (iter.block->u.v.argb & 0xffffff) != 0xffffff)
986
0
      break;
987
988
    /* If we don't cover the coverage area, then we can't be a background. */
989
0
    bbox = fz_expand_rect(iter.block->bbox, 0.1f); /* Allow for rounding */
990
0
    if (!fz_contains_rect(bbox, coverage))
991
0
      break;
992
993
    /* If we don't cover at least 90% of the height/width, then give up. */
994
0
    if (fz_is_infinite_rect(page->mediabox))
995
0
    {
996
      /* Can't judge. Skip this check. */
997
0
    }
998
0
    else if ((iter.block->bbox.y1 - iter.block->bbox.y0) < 0.9f * (page->mediabox.y1 - page->mediabox.y0) ||
999
0
      (iter.block->bbox.x1 - iter.block->bbox.x0) < 0.9f * (page->mediabox.x1 - page->mediabox.x0))
1000
0
    {
1001
0
      break;
1002
0
    }
1003
1004
    /* This is a background block. Remove it. This will be the first block
1005
     * in the list, so it's simpler. */
1006
0
    if (iter.parent)
1007
0
    {
1008
0
      iter.parent->first_block = iter.block->next;
1009
0
      if (iter.block->next)
1010
0
        iter.block->next->prev = NULL;
1011
0
      else
1012
0
        iter.parent->last_block = NULL;
1013
0
    }
1014
0
    else
1015
0
    {
1016
0
      iter.page->first_block = iter.block->next;
1017
0
      if (iter.block->next)
1018
0
        iter.block->next->prev = NULL;
1019
0
      else
1020
0
        iter.page->last_block = NULL;
1021
0
    }
1022
1023
0
    dropped = 1;
1024
0
  }
1025
1026
0
  return dropped;
1027
0
}