Coverage Report

Created: 2025-01-28 06:17

/src/mupdf/source/fitz/stext-table.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
25
#include <assert.h>
26
27
/* #define DEBUG_WRITE_AS_PS */
28
29
/* #define DEBUG_TABLE_STRUCTURE */
30
31
/* #define DEBUG_TABLE_HUNT */
32
33
/*
34
 * The algorithm.
35
 *
36
 *  The goal of the algorithm is to identify tables on a page.
37
 *  First we have to find the tables on a page, then we have to
38
 *  figure out where the columns/rows are, and then how the
39
 *  cells span them.
40
 *
41
 *  We do this as a series of steps.
42
 *
43
 *  To illustrate what's going on, let's use an example page
44
 *  that we can follow through all the steps.
45
 *
46
 *  +---------------------------+
47
 *  |                           |
48
 *  |      #### ## ### ##       | <- Title
49
 *  |                           |
50
 *  |    ##### ##### #### ##    |   \
51
 *  |    ## ###### ###### ##    |    |
52
 *  |    #### ####### ######    |    |- Abstract
53
 *  |    ####### #### ## ###    |    |
54
 *  |    ### ##### ######       |   /
55
 *  |                           |
56
 *  |   #########   #########   |   2 Columns of text
57
 *  |   #########   #########   |
58
 *  |   ########    #########   |
59
 *  |               #########   |
60
 *  |   +-------+   #######     |   <- With an image on the left
61
 *  |   |       |               |
62
 *  |   |       |   ## ## # #   |   <- And a table on the right
63
 *  |   +-------+   ## ## # #   |
64
 *  |               ## ## # #   |
65
 *  |   #########   ## ## # #   |
66
 *  |   #########   ## ## # #   |
67
 *  |   #########   ## ## # #   |
68
 *  |                           |
69
 *  +---------------------------+
70
 *
71
 *
72
 * Step 1: Segmentation.
73
 *
74
 *  First, we segment the page, trying to break it down into a
75
 *  series of non-overlapping rectangles. We do this (in stext-boxer.c)
76
 *  by looking for where the content isn't. If we can identify breaks
77
 *  that run through the page (either from top to bottom or from left
78
 *  to right), then we can split the page there, and recursively consider
79
 *  the two halves of the page.
80
 *
81
 *  It's not a perfect algorithm, but it manages to in many cases.
82
 *
83
 *  After segmenting the above example, first we'll find the horizontal
84
 *  splits, giving:
85
 *
86
 *  +---------------------------+
87
 *  |                           |
88
 *  |      #### ## ### ##       |
89
 *  +---------------------------+
90
 *  |    ##### ##### #### ##    |
91
 *  |    ## ###### ###### ##    |
92
 *  |    #### ####### ######    |
93
 *  |    ####### #### ## ###    |
94
 *  |    ### ##### ######       |
95
 *  +---------------------------+
96
 *  |   #########   #########   |
97
 *  |   #########   #########   |
98
 *  |   ########    #########   |
99
 *  |               #########   |
100
 *  |   +-------+   #######     |
101
 *  |   |       |               |
102
 *  |   |       |   ## ## # #   |
103
 *  |   +-------+   ## ## # #   |
104
 *  |               ## ## # #   |
105
 *  |   #########   ## ## # #   |
106
 *  |   #########   ## ## # #   |
107
 *  |   #########   ## ## # #   |
108
 *  |                           |
109
 *  +---------------------------+
110
 *
111
 *  Then we'll recurse and find the vertical split between
112
 *  the columns:
113
 *
114
 *  +---------------------------+
115
 *  |                           |
116
 *  |      #### ## ### ##       |
117
 *  +---------------------------+
118
 *  |    ##### ##### #### ##    |
119
 *  |    ## ###### ###### ##    |
120
 *  |    #### ####### ######    |
121
 *  |    ####### #### ## ###    |
122
 *  |    ### ##### ######       |
123
 *  +-------------+-------------+
124
 *  |   ######### | #########   |
125
 *  |   ######### | #########   |
126
 *  |   ########  | #########   |
127
 *  |             | #########   |
128
 *  |   +-------+ | #######     |
129
 *  |   |       | |             |
130
 *  |   |       | | ## ## # #   |
131
 *  |   +-------+ | ## ## # #   |
132
 *  |             | ## ## # #   |
133
 *  |   ######### | ## ## # #   |
134
 *  |   ######### | ## ## # #   |
135
 *  |   ######### | ## ## # #   |
136
 *  |             |             |
137
 *  +-------------+-------------+
138
 *
139
 *  Then we recurse again and find the horizontal splits
140
 *  within the columns:
141
 *
142
 *  +---------------------------+
143
 *  |                           |
144
 *  |      #### ## ### ##       |
145
 *  +---------------------------+
146
 *  |    ##### ##### #### ##    |
147
 *  |    ## ###### ###### ##    |
148
 *  |    #### ####### ######    |
149
 *  |    ####### #### ## ###    |
150
 *  |    ### ##### ######       |
151
 *  +-------------+-------------+
152
 *  |   ######### | #########   |
153
 *  |   ######### | #########   |
154
 *  |   ########  | #########   |
155
 *  +-------------+ #########   |
156
 *  |   +-------+ | #######     |
157
 *  |   |       | +-------------+
158
 *  |   |       | | ## ## # #   |
159
 *  |   +-------+ | ## ## # #   |
160
 *  +-------------+ ## ## # #   |
161
 *  |   ######### | ## ## # #   |
162
 *  |   ######### | ## ## # #   |
163
 *  |   ######### | ## ## # #   |
164
 *  |             |             |
165
 *  +-------------+-------------+
166
 *
167
 *  We recurse a fixed maximum number of times (currently
168
 *  6, IIRC) or until we fail to find any suitable splits.
169
 *
170
 *  This completes the page segmentation step.
171
 *
172
 * Step 2: Grid finding
173
 *
174
 *  Next, we look at each of those segments and try to identify
175
 *  where grids might be.
176
 *
177
 *  Imagine the bottom right section of that page as
178
 *  a board with lego blocks on where there is text.
179
 *  Now imagine viewing that from the bottom of the page.
180
 *  The gaps between the columns of the table are where you
181
 *  can see through to the top between the blocks.
182
 *
183
 *  Similarly, if you view it from the side, the gaps between the
184
 *  rows of the page are where you can see through to the other
185
 *  side.
186
 *
187
 *  So, how do we code that? Well, we run through the page content
188
 *  (obviously, restricted to the content that falls into this
189
 *  segment of the page - that'll go without saying from here on
190
 *  in). For each bit of content, we look at the "x extent" of that
191
 *  content - for instance a given string might start at position
192
 *  10 and continue to position 100. We build a list of all these
193
 *  start, and stop positions, and keep them in a sorted list.
194
 *
195
 *  Then we walk this list from left to right, keeping a sum. I
196
 *  call this sum "wind", because it's very similar to the winding
197
 *  number that you get when doing scan conversion of bezier shapes.
198
 *
199
 *  wind starts out as 0. We increment it whenever we pass a 'start'
200
 *  position, and decrement it whenever we pass a 'stop' position.
201
 *  So at any given x position along the line wind tells us the
202
 *  number of pieces of content that overlap that x position.
203
 *  So wind(left) = 0 = wind(right), and wind(x) >= x for all x.
204
 *
205
 *  So, if we walk from left to right, the trace of wind might
206
 *  look something like:
207
 *
208
 *               __
209
 *    ___       /  \  _        __
210
 *   /   \     /    \/ \     _/  \_
211
 *  /     \___/         \___/      \
212
 *
213
 *  The left and right edges of the table are pretty clear.
214
 *  The regions where wind drops to 0 represent the column dividers.
215
 *  The left and right hand side of those regions gives us the min
216
 *  and max values for that divider.
217
 *
218
 *  We can then repeat this process for Y ranges instead of X ranges
219
 *  to get the row dividers.
220
 *
221
 *  BUT, this only works for pure grid tables. It falls down for
222
 *  cases where we have merged cells (which is very common due to
223
 *  titles etc).
224
 *
225
 *  We can modify the algorithm slightly to allow for this.
226
 *
227
 *  Consider the following table:
228
 *
229
 *  +-----------------------------------+
230
 *  |  Long Table title across the top  |
231
 *  +---------------+---------+---------+
232
 *  | Name          | Result1 | Result2 |
233
 *  +---------------+----+----+----+----+
234
 *  | Homer Simpson |  1 | 23 |  4 | 56 |
235
 *  | Barney Gumble |  1 | 23 |  4 | 56 |
236
 *  | Moe           |  1 | 23 |  4 | 56 |
237
 *  | Apu           |  1 | 23 |  4 | 56 |
238
 *  | Ned Flanders  |  1 | 23 |  4 | 56 |
239
 *  +---------------+----+----+----+----+
240
 *
241
 *  The wind trace for that looks something like
242
 *  (with a certain degree of artistic license for the
243
 *  limitations of ascii art):
244
 *
245
 *     ________
246
 *    /        \      _   __     _   _
247
 *   /          \____/ \_/  \___/ \_/ \
248
 *  /                                  \
249
 *
250
 *  So, the trace never quite drops back to zero in the
251
 *  middle due to the spanning of the top title.
252
 *
253
 *  So, instead of just looking for points where the trace
254
 *  drops to zero, we instead look for local minima. Each local
255
 *  minima represents a place where there might be a grid dividier.
256
 *  The value of wind at such points can be considered the
257
 *  "uncertainty" with which there might be a divider there.
258
 *  Clear dividers (with a wind value of 0) have no uncertainty.
259
 *  Places where cells are spanned have a higher value of uncertainty.
260
 *
261
 *  The output from this step is the list of possible grid positions
262
 *  (X and Y), with uncertainty values.
263
 *
264
 * Step 3: Cell analysis
265
 *
266
 *  So, armed with the output from step 2, we can examine each grid
267
 *  found. If we have W x-dividers and H y-dividers, we know we have
268
 *  a potential table with (W-1) x (H-1) cells in it.
269
 *
270
 *  We represent this as a W x H grid of cells, each like:
271
 *
272
 *          .       .
273
 *          .       .
274
 *     . . .+-------+. . .  Each cell holds information about the
275
 *          |       . edges above, and to the left of it.
276
 *          |       .
277
 *          |       .
278
 *     . . .+. . . .+. . .
279
 *          .       .
280
 *          .       .
281
 *
282
 *  h_line: Is there a horizontal divider drawn on the page that
283
 *  corresponds to the top of this cell (i.e. is there a cell border
284
 *  here?)
285
 *  v_line: Is there a vertical divider drawn on the page that
286
 *  corresponds to the left of this cell (i.e. is there a cell border
287
 *  here?)
288
 *  h_crossed: Does content cross this line (i.e. are we merged
289
 *  with the cell above?)
290
 *  v_crossed: Does content cross this line (i.e. are we merged
291
 *  with the cell to the left?)
292
 *  full: Is there any content in this cell at all?
293
 *
294
 *  We need a W x H grid of cells to represent the entire table due
295
 *  to the potential right and bottom edge lines. The right and
296
 *  bottom rows of cells should never be full, or be crossed, but
297
 *  it's easiest just to use a simple representation that copes with
298
 *  the h_line and v_line values naturally.
299
 *
300
 *  So, we start with the cells structure empty, and we run through
301
 *  the page content, filling in the details as we go.
302
 *
303
 *  At the end of the process, we have enough information to draw
304
 *  an asciiart representation of our table. It might look something
305
 *  like this (this comes from dotted-gridlines-tables.pdf):
306
 *
307
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+
308
 *  | | |   | | |#|#|#|#| | | |
309
 *  + + + + + + +v+ +v+v+ + + +
310
 *  | | |   | | |#|#|#|#| | | |
311
 *  + + + + + + + +v+ + + + + +
312
 *  | |#|   | |#|#|#|#|#|#|#|#|
313
 *  + +v+ + + +v+v+ +v+v+v+v+v+
314
 *  | |#|   |#|#|#|#|#|#|#|#|#|
315
 *  + + + + +v+ + +v+ + + + + +
316
 *  |#|#|  #|#|#|#|#|#|#|#|#|#|
317
 *  +v+v+ +v+ +v+v+ +v+v+v+v+v+
318
 *  |#|#|  #|#|#|#|#|#|#|#|#|#|
319
 *  + + + + +v+ + +v+ + + + + +
320
 *  | |#|   |#|#|#|#|#|#|#|#|#|
321
 *  + +v+ + + +v+v+ +v+v+v+v+v+
322
 *  | |#|   | |#|#|#|#|#|#|#|#|
323
 *  + + + + + + + +v+ + + + + +
324
 *  | | |   | | |#|#|#|#| | | |
325
 *  + + + + + + +v+ +v+v+ + + +
326
 *  | | |   | | |#|#|#|#| | | |
327
 *  +-+-+-+-+-+-+-+-+-+-+-+-+-+
328
 *    | |#  |#| | |#| | |#|#|#|
329
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
330
 *    | |#  |#| | |#| | |#|#|#|
331
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
332
 *    | |#>#|#| | |#| | |#|#|#|
333
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
334
 *    | |#>#|#| | |#| | |#|#|#|
335
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
336
 *    | |#>#|#| |#| | | |#|#|#|
337
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
338
 *    | |#  |#| | |#| | |#|#|#|
339
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
340
 *    | |#  |#| | |#| | |#|#|#|
341
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
342
 *    | |#  |#| | |#| | |#|#|#|
343
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
344
 *    | |#  |#| | |#| | |#|#|#|
345
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
346
 *    |#    |#|#|#|#|#|#|#|#|#|
347
 *  + +-+-+-+-+-+-+-+-+-+-+-+-+
348
 *
349
 *  This shows where lines are detected ( - and | ),
350
 *  where they are crossed ( > and v) and where cells
351
 *  are full ( # ).
352
 *
353
 * Step 4: Row and column merging.
354
 *
355
 *  Based on the information above, we then try to merge
356
 *  cells and columns to simplify the table.
357
 *
358
 *  The best rules I've come up with this so far are:
359
 *  We can merge two adjacent columns if all the pairs of
360
 *  cells in the two columns are mergeable.
361
 *
362
 *  Cells are held to be mergable or not based upon the following
363
 *  rules:
364
 *    If there is a line between 2 cells - not mergeable.
365
 *    else if the uncertainty between 2 cells is 0 - not mergeable.
366
 *    else if the line between the 2 cells is crossed - mergeable.
367
 *    else if strictly one of the cells is full - mergeable.
368
 *    else not mergeable.
369
 *
370
 *  So in the above example, column 2 (numbered from 0) can be merged
371
 *  with column 3.
372
 *
373
 *  This gives:
374
 *
375
 *  +-+-+-+-+-+-+-+-+-+-+-+-+
376
 *  | | | | | |#|#|#|#| | | |
377
 *  + + + + + +v+ +v+v+ + + +
378
 *  | | | | | |#|#|#|#| | | |
379
 *  + + + + + + +v+ + + + + +
380
 *  | |#| | |#|#|#|#|#|#|#|#|
381
 *  + +v+ + +v+v+ +v+v+v+v+v+
382
 *  | |#| |#|#|#|#|#|#|#|#|#|
383
 *  + + + +v+ + +v+ + + + + +
384
 *  |#|#|#|#|#|#|#|#|#|#|#|#|
385
 *  +v+v+v+ +v+v+ +v+v+v+v+v+
386
 *  |#|#|#|#|#|#|#|#|#|#|#|#|
387
 *  + + + +v+ + +v+ + + + + +
388
 *  | |#| |#|#|#|#|#|#|#|#|#|
389
 *  + +v+ + +v+v+ +v+v+v+v+v+
390
 *  | |#| | |#|#|#|#|#|#|#|#|
391
 *  + + + + + + +v+ + + + + +
392
 *  | | | | | |#|#|#|#| | | |
393
 *  + + + + + +v+ +v+v+ + + +
394
 *  | | | | | |#|#|#|#| | | |
395
 *  +-+-+-+-+-+-+-+-+-+-+-+-+
396
 *    | |#|#| | |#| | |#|#|#|
397
 *  + +-+-+-+-+-+-+-+-+-+-+-+
398
 *    | |#|#| | |#| | |#|#|#|
399
 *  + +-+-+-+-+-+-+-+-+-+-+-+
400
 *    | |#|#| | |#| | |#|#|#|
401
 *  + +-+-+-+-+-+-+-+-+-+-+-+
402
 *    | |#|#| | |#| | |#|#|#|
403
 *  + +-+-+-+-+-+-+-+-+-+-+-+
404
 *    | |#|#| |#| | | |#|#|#|
405
 *  + +-+-+-+-+-+-+-+-+-+-+-+
406
 *    | |#|#| | |#| | |#|#|#|
407
 *  + +-+-+-+-+-+-+-+-+-+-+-+
408
 *    | |#|#| | |#| | |#|#|#|
409
 *  + +-+-+-+-+-+-+-+-+-+-+-+
410
 *    | |#|#| | |#| | |#|#|#|
411
 *  + +-+-+-+-+-+-+-+-+-+-+-+
412
 *    | |#|#| | |#| | |#|#|#|
413
 *  + +-+-+-+-+-+-+-+-+-+-+-+
414
 *    |#  |#|#|#|#|#|#|#|#|#|
415
 *  + +-+-+-+-+-+-+-+-+-+-+-+
416
 *
417
 *  We then perform the same merging process for rows as for
418
 *  columns - though there are no rows in the above example
419
 *  that can be merged.
420
 *
421
 *  You'll note that, for example, we don't merge row 0 and
422
 *  row 1 in the above, because we have a pair of cells that
423
 *  are both full without crossing.
424
 *
425
 * Step 5: Cell spanning
426
 *
427
 *  Now we actually start to output the table. We keep a 'sent_table'
428
 *  (a grid of W x H bools) to keep track of whether we've output
429
 *  the content for a given cell or not yet.
430
 *
431
 *  For each cell we reach, assuming sent_table[x,y] is false,
432
 *  we merge it with as many cells on the right as required,
433
 *  according to 'v_crossed' values (subject to not passing
434
 *  v_lines or uncertainty == 0's).
435
 *
436
 *  We then try to merge cells below according to 'h_crossed'
437
 *  values (subject to not passing h_lines or uncertainty == 0's).
438
 *
439
 *  In theory this can leave us with some cases where we'd like
440
 *  to merge some cells (because of crossed) and can't (because
441
 *  of lines or sent_table[]) values. In the absence of better
442
 *  cell spanning algorithms we have no choice here.
443
 *
444
 *  Then we output the contents and set sent_table[] values as
445
 *  appropriate.
446
 *
447
 *  If a row has no cells in it, we currently omit the TR. If/when
448
 *  we figure out how to indicate rowspan/colspan in stext, we can
449
 *  revisit that.
450
 */
451
452
453
static fz_stext_block *
454
add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block)
455
0
{
456
0
  fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block));
457
0
  memset(block, 0, sizeof(*block));
458
0
  block->type = FZ_STEXT_BLOCK_GRID;
459
0
  block->bbox = fz_empty_rect; /* Fixes bug 703267. */
460
0
  block->next = *first_block;
461
0
  if (*first_block)
462
0
  {
463
0
    (*first_block)->prev = block;
464
0
    assert(*last_block);
465
0
  }
466
0
  else
467
0
  {
468
0
    assert(*last_block == NULL);
469
0
    *last_block = block;
470
0
  }
471
0
  *first_block = block;
472
0
  return block;
473
0
}
474
475
static void
476
insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest)
477
0
{
478
0
  if (before)
479
0
  {
480
    /* We have a block to insert it before, so we know it's not the last. */
481
0
    assert(dest->first_block != NULL && dest->last_block != NULL);
482
0
    block->next = before;
483
0
    block->prev = before->prev;
484
0
    if (before->prev)
485
0
    {
486
0
      assert(before->prev->next == before);
487
0
      before->prev->next = block;
488
0
    }
489
0
    else if (dest)
490
0
    {
491
0
      assert(dest->first_block == before);
492
0
      dest->first_block = block;
493
0
    }
494
0
    else
495
0
    {
496
0
      assert(page->first_block == before);
497
0
      page->first_block = block;
498
0
    }
499
0
    before->prev = block;
500
0
  }
501
0
  else if (dest)
502
0
  {
503
    /* Will be the last block. */
504
0
    block->next = NULL;
505
0
    block->prev = dest->last_block;
506
0
    if (dest->last_block)
507
0
    {
508
0
      assert(dest->last_block->next == NULL);
509
0
      dest->last_block->next = block;
510
0
    }
511
0
    if (dest->first_block == NULL)
512
0
      dest->first_block = block;
513
0
    dest->last_block = block;
514
0
  }
515
0
  else
516
0
  {
517
    /* Will be the last block. */
518
0
    block->next = NULL;
519
0
    block->prev = page->last_block;
520
0
    if (page->last_block)
521
0
    {
522
0
      assert(page->last_block->next == NULL);
523
0
      page->last_block->next = block;
524
0
    }
525
0
    if (page->first_block == NULL)
526
0
      page->first_block = block;
527
0
    page->last_block = block;
528
0
  }
529
0
}
530
531
static fz_stext_struct *
532
add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw)
533
0
{
534
0
  fz_stext_block *block;
535
0
  int idx = 0;
536
0
  size_t z;
537
0
  fz_stext_struct *newstruct;
538
539
0
  if (raw == NULL)
540
0
    raw = "";
541
0
  z = strlen(raw);
542
543
  /* We're going to insert a struct block. We need an idx, so walk the list */
544
0
  for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next)
545
0
  {
546
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
547
0
    {
548
0
      assert(block->u.s.index >= idx);
549
0
      idx = block->u.s.index + 1;
550
0
    }
551
0
  }
552
  /* So we'll add our block as idx. But all the other struct blocks that follow us need to have
553
   * larger values. */
554
555
  /* Update all the subsequent structs to have a higher idx */
556
0
  if (before)
557
0
  {
558
0
    int idx2 = idx+1;
559
0
    for (block = before->next; block != NULL; block = block->next)
560
0
    {
561
0
      if (block->type != FZ_STEXT_BLOCK_STRUCT)
562
0
        continue;
563
0
      if (block->u.s.index > idx2)
564
0
        break;
565
0
      block->u.s.index = idx2++;
566
0
    }
567
0
  }
568
569
  /* Now make our new struct block and insert it. */
570
0
  block = fz_pool_alloc(ctx, page->pool, sizeof(*block));
571
0
  block->type = FZ_STEXT_BLOCK_STRUCT;
572
0
  block->bbox = fz_empty_rect; /* Fixes bug 703267. */
573
0
  insert_block_before(block, before, page, parent);
574
575
0
  block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z);
576
0
  block->u.s.index = idx;
577
0
  newstruct->parent = parent;
578
0
  newstruct->standard = std;
579
0
  memcpy(newstruct->raw, raw, z);
580
0
  newstruct->raw[z] = 0;
581
0
  newstruct->up = block;
582
583
0
  return newstruct;
584
0
}
585
586
typedef struct
587
{
588
  int len;
589
  int max;
590
  struct {
591
    int left;
592
    float pos;
593
    int freq;
594
  } *list;
595
} div_list;
596
597
static void
598
div_list_push(fz_context *ctx, div_list *div, int left, float pos)
599
0
{
600
0
  int i;
601
602
  /* FIXME: Could be bsearch. */
603
0
  for (i = 0; i < div->len; i++)
604
0
  {
605
0
    if (div->list[i].pos > pos)
606
0
      break;
607
0
    else if (div->list[i].pos == pos && div->list[i].left == left)
608
0
    {
609
0
      div->list[i].freq++;
610
0
      return;
611
0
    }
612
0
  }
613
614
0
  if (div->len == div->max)
615
0
  {
616
0
    int newmax = div->max * 2;
617
0
    if (newmax == 0)
618
0
      newmax = 32;
619
0
    div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax);
620
0
    div->max = newmax;
621
0
  }
622
623
0
  if (i < div->len)
624
0
    memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i));
625
0
  div->len++;
626
0
  div->list[i].left = left;
627
0
  div->list[i].pos = pos;
628
0
  div->list[i].freq = 1;
629
0
}
630
631
static fz_stext_grid_positions *
632
make_table_positions(fz_context *ctx, div_list *xs, float min, float max)
633
0
{
634
0
  int wind;
635
0
  fz_stext_grid_positions *pos;
636
0
  int len = xs->len;
637
0
  int i;
638
0
  int hi = 0;
639
640
  /* Count the number of edges */
641
0
  int local_min = 0;
642
0
  int edges = 2;
643
644
0
  if (len == 0)
645
0
    return NULL;
646
647
0
  assert(xs->list[0].left);
648
0
  for (i = 0; i < len; i++)
649
0
  {
650
0
    if (xs->list[i].left)
651
0
    {
652
0
      if (local_min)
653
0
        edges++;
654
0
    }
655
0
    else
656
0
      local_min = 1;
657
0
  }
658
0
  assert(!xs->list[i-1].left);
659
660
0
  pos = fz_calloc(ctx, 1, sizeof(*pos) + (edges-1) * sizeof(pos->list[0]));
661
0
  pos->len = edges;
662
663
  /* Copy the edges in */
664
0
  wind = 0;
665
0
  local_min = 0;
666
0
  edges = 1;
667
0
  pos->list[0].pos = xs->list[0].pos;
668
0
  pos->list[0].min = min;
669
0
  pos->list[0].max = pos->list[0].pos;
670
0
  pos->list[0].uncertainty = 0;
671
#ifdef DEBUG_TABLE_HUNT
672
  printf("|%g ", pos->list[0].pos);
673
#endif
674
0
  for (i = 0; i < len; i++)
675
0
  {
676
0
    if (xs->list[i].left)
677
0
    {
678
0
      if (local_min)
679
0
      {
680
0
        pos->list[edges].min = xs->list[i-1].pos;
681
0
        pos->list[edges].max = xs->list[i].pos;
682
0
        pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2;
683
0
        pos->list[edges++].uncertainty = wind;
684
#ifdef DEBUG_TABLE_HUNT
685
        if (wind)
686
          printf("?%g(%d) ", pos->list[edges].pos, wind);
687
        else
688
          printf("|%g ", pos->list[edges].pos);
689
#endif
690
0
      }
691
0
      wind += xs->list[i].freq;
692
0
      if (wind > hi)
693
0
        hi = wind;
694
0
    }
695
0
    else
696
0
    {
697
0
      wind -= xs->list[i].freq;
698
0
      local_min = 1;
699
0
    }
700
0
  }
701
0
  assert(wind == 0);
702
0
  pos->list[edges].pos = xs->list[i-1].pos;
703
0
  pos->list[edges].min = xs->list[i-1].pos;
704
0
  pos->list[edges].max = max;
705
0
  pos->list[edges].uncertainty = 0;
706
0
  pos->max_uncertainty = hi;
707
#ifdef DEBUG_TABLE_HUNT
708
  printf("|%g\n", pos->list[edges].pos);
709
#endif
710
711
0
  return pos;
712
0
}
713
714
static fz_stext_grid_positions *
715
clone_grid_positions(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs)
716
0
{
717
0
  size_t z = sizeof(*xs) + (xs->len-1) * sizeof(xs->list[0]);
718
0
  fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z);
719
720
0
  memcpy(xs2, xs, z);
721
722
0
  return xs2;
723
0
}
724
725
static void
726
sanitize_positions(fz_context *ctx, div_list *xs)
727
0
{
728
0
  int i, j;
729
730
#ifdef DEBUG_TABLE_HUNT
731
  printf("OK:\n");
732
  for (i = 0; i < xs->len; i++)
733
  {
734
    if (xs->list[i].left)
735
      printf("[");
736
    printf("%g(%d)", xs->list[i].pos, xs->list[i].freq);
737
    if (!xs->list[i].left)
738
      printf("]");
739
    printf(" ");
740
  }
741
  printf("\n");
742
#endif
743
744
  /* Now, combine runs of left and right */
745
0
  for (i = 0; i < xs->len; i++)
746
0
  {
747
0
    if (xs->list[i].left)
748
0
    {
749
0
      j = i;
750
0
      while (i < xs->len-1 && xs->list[i+1].left)
751
0
      {
752
0
        i++;
753
0
        xs->list[j].freq += xs->list[i].freq;
754
0
        xs->list[i].freq = 0;
755
0
      }
756
0
    }
757
0
    else
758
0
    {
759
0
      while (i < xs->len-1 && !xs->list[i+1].left)
760
0
      {
761
0
        i++;
762
0
        xs->list[i].freq += xs->list[i-1].freq;
763
0
        xs->list[i-1].freq = 0;
764
0
      }
765
0
    }
766
0
  }
767
768
#ifdef DEBUG_TABLE_HUNT
769
  printf("Shrunk:\n");
770
  for (i = 0; i < xs->len; i++)
771
  {
772
    if (xs->list[i].left)
773
      printf("[");
774
    printf("%g(%d)", xs->list[i].pos, xs->list[i].freq);
775
    if (!xs->list[i].left)
776
      printf("]");
777
    printf(" ");
778
  }
779
  printf("\n");
780
#endif
781
782
  /* Now remove the 0 frequency ones. */
783
0
  j = 0;
784
0
  for (i = 0; i < xs->len; i++)
785
0
  {
786
0
    if (xs->list[i].freq == 0)
787
0
      continue;
788
0
    if (i != j)
789
0
      xs->list[j] = xs->list[i];
790
0
    j++;
791
0
  }
792
0
  xs->len = j;
793
794
#ifdef DEBUG_TABLE_HUNT
795
  printf("Compacted:\n");
796
  for (i = 0; i < xs->len; i++)
797
  {
798
    if (xs->list[i].left)
799
      printf("[");
800
    printf("%g(%d)", xs->list[i].pos, xs->list[i].freq);
801
    if (!xs->list[i].left)
802
      printf("]");
803
    printf(" ");
804
  }
805
  printf("\n");
806
#endif
807
0
}
808
809
static int
810
all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block)
811
0
{
812
0
  for (; block != NULL; block = block->next)
813
0
  {
814
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
815
0
    {
816
0
      if (block->u.s.down == NULL)
817
0
        continue;
818
0
      if (block->u.s.down->standard == FZ_STRUCTURE_H ||
819
0
        block->u.s.down->standard == FZ_STRUCTURE_H1 ||
820
0
        block->u.s.down->standard == FZ_STRUCTURE_H2 ||
821
0
        block->u.s.down->standard == FZ_STRUCTURE_H3 ||
822
0
        block->u.s.down->standard == FZ_STRUCTURE_H4 ||
823
0
        block->u.s.down->standard == FZ_STRUCTURE_H5 ||
824
0
        block->u.s.down->standard == FZ_STRUCTURE_H6)
825
0
        continue;
826
0
      if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block))
827
0
        return 0;
828
0
    }
829
0
    if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
830
0
      return 0;
831
0
  }
832
833
0
  return 1;
834
0
}
835
836
static void
837
walk_blocks(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, int descend)
838
0
{
839
0
  fz_stext_block *block;
840
0
  fz_stext_line *line;
841
0
  fz_stext_char *ch;
842
843
0
  for (block = first_block; block != NULL; block = block->next)
844
0
  {
845
0
    switch (block->type)
846
0
    {
847
0
    case FZ_STEXT_BLOCK_STRUCT:
848
      /* Don't descend into */
849
0
      if (descend && block->u.s.down && !all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block))
850
0
        walk_blocks(ctx, xs, ys, block->u.s.down->first_block, descend);
851
0
      break;
852
0
    case FZ_STEXT_BLOCK_VECTOR:
853
0
      break;
854
0
    case FZ_STEXT_BLOCK_TEXT:
855
0
      for (line = block->u.t.first_line; line != NULL; line = line->next)
856
0
      {
857
0
        float rpos;
858
0
        int left = 1;
859
0
        int right = 0;
860
0
        int non_empty = 0;
861
0
        for (ch = line->first_char; ch != NULL; ch = ch->next)
862
0
        {
863
0
          if (ch->c == ' ')
864
0
          {
865
0
            if (ch->next == NULL)
866
0
            {
867
              /* This is a trailing space. We've seen cases where we get
868
               * trailing spaces on cell contents and this screws stuff
869
               * up (e.g. dotted-gridlines-tables.pdf). */
870
0
              if (right)
871
0
              {
872
                /* Send a 'right' as the left position of this space. */
873
0
                float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
874
0
                div_list_push(ctx, xs, 0, lpos);
875
0
                left = 1;
876
0
                right = 0;
877
0
              }
878
0
            }
879
0
            else if (ch->next->c == ' ')
880
0
            {
881
              /* Run of multiple spaces. Send a 'right' as the left position
882
               * of this space, and then skip forwards. */
883
0
              if (right)
884
0
              {
885
0
                float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
886
0
                div_list_push(ctx, xs, 0, lpos);
887
0
                while (ch->next && ch->next->c == ' ')
888
0
                  ch = ch->next;
889
0
                left = 1;
890
0
                right = 0;
891
0
              }
892
0
            }
893
0
            else
894
0
            {
895
              /* Ignore any other spaces. Don't start or end a run on them. */
896
0
            }
897
0
          }
898
0
          else
899
0
          {
900
0
            non_empty = 1;
901
0
            if (left)
902
0
            {
903
0
              float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
904
0
              div_list_push(ctx, xs, 1, lpos);
905
0
              left = 0;
906
0
            }
907
0
            rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x);
908
0
            right = 1;
909
0
          }
910
0
        }
911
0
        if (non_empty)
912
0
        {
913
0
          div_list_push(ctx, ys, 1, line->bbox.y0);
914
0
          div_list_push(ctx, ys, 0, line->bbox.y1);
915
0
        }
916
0
        if (right)
917
0
          div_list_push(ctx, xs, 0, rpos);
918
0
      }
919
0
      break;
920
0
    }
921
0
  }
922
0
}
923
924
/* One of our datastructures (cells_t) is an array of details about the
925
 * cells that make up our table. It's a w * h array of cell_t's. Each
926
 * cell contains data on one of the cells in the table, as you'd expect.
927
 *
928
 *     .       .
929
 *     .       .
930
 * - - +-------+ - -
931
 *     |       .
932
 *     |       .
933
 *     |       .
934
 * - - + - - - + - -
935
 *     .       .
936
 *     .       .
937
 *
938
 * For any given cell, we store details about the top (lowest y coord)
939
 * and left (lowest x coord) edges. Specifically we store whether
940
 * there is a line at this position (h_line and v_line) (i.e. a drawn
941
 * border), and we also store whether content crosses this edge (h_crossed
942
 * and y_crossed). Finally, we store whether the cell has any content
943
 * in it at all (full).
944
 *
945
 * A table which has w positions across and h positions vertically, will
946
 * only really have (w-1) * (h-1) cells. We store w*h though to allow for
947
 * the right and bottom edges to have their lines represented.
948
 */
949
950
typedef struct
951
{
952
  int h_line;
953
  int v_line;
954
  int h_crossed;
955
  int v_crossed;
956
  int full;
957
} cell_t;
958
959
typedef struct
960
{
961
  int w;
962
  int h;
963
  cell_t cell[1];
964
} cells_t;
965
966
typedef struct
967
{
968
  cells_t *cells;
969
  fz_stext_grid_positions *xpos;
970
  fz_stext_grid_positions *ypos;
971
} grid_walker_data;
972
973
static cell_t *
974
get_cell(cells_t *cells, int x, int y)
975
0
{
976
0
  return &cells->cell[x + y * cells->w];
977
0
}
978
979
static int
980
find_grid_pos_with_reinforcement(fz_context *ctx, fz_stext_grid_positions *pos, float x, int expand)
981
0
{
982
0
  int i;
983
984
0
  for (i = 0; i < pos->len; i++)
985
0
  {
986
0
    int r;
987
0
    if (x > pos->list[i].max)
988
0
      continue;
989
0
    if (x < pos->list[i].min)
990
0
    {
991
0
      if (expand && i > 0)
992
0
      {
993
0
        float mid = (pos->list[i].min + pos->list[i-1].max)/2;
994
0
        if (x < mid)
995
0
          return i-1;
996
0
        else
997
0
          return i;
998
0
      }
999
0
      return -1;
1000
0
    }
1001
0
    r = pos->list[i].reinforcement++;
1002
0
    pos->list[i].pos = (pos->list[i].pos * r + x) / (r+1);
1003
0
    return i;
1004
0
  }
1005
1006
0
  return -1;
1007
0
}
1008
1009
static int
1010
find_cell_l(fz_stext_grid_positions *pos, float x)
1011
0
{
1012
0
  int i;
1013
1014
0
  for (i = 0; i < pos->len; i++)
1015
0
    if (x < pos->list[i].pos)
1016
0
      return i-1;
1017
1018
0
  return -1;
1019
0
}
1020
1021
static int
1022
find_cell_r(fz_stext_grid_positions *pos, float x)
1023
0
{
1024
0
  int i;
1025
1026
0
  for (i = 0; i < pos->len; i++)
1027
0
    if (x <= pos->list[i].pos)
1028
0
      return i-1;
1029
1030
0
  return -1;
1031
0
}
1032
1033
static int
1034
add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1)
1035
0
{
1036
0
  int start = find_grid_pos_with_reinforcement(ctx, gd->xpos, x0, 1);
1037
0
  int end = find_grid_pos_with_reinforcement(ctx, gd->xpos, x1, 1);
1038
0
  float y = (y0 + y1) / 2;
1039
0
  int yidx = find_grid_pos_with_reinforcement(ctx, gd->ypos, y, 0);
1040
0
  int i;
1041
1042
0
  if (start < 0 || end < 0 || yidx < 0 || start >= end)
1043
0
    return 1;
1044
1045
0
  for (i = start; i < end; i++)
1046
0
    get_cell(gd->cells, i, yidx)->h_line++;
1047
1048
0
  return 0;
1049
0
}
1050
1051
static int
1052
add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1)
1053
0
{
1054
0
  int start = find_grid_pos_with_reinforcement(ctx, gd->ypos, y0, 1);
1055
0
  int end = find_grid_pos_with_reinforcement(ctx, gd->ypos, y1, 1);
1056
0
  float x = (x0 + x1) / 2;
1057
0
  int xidx = find_grid_pos_with_reinforcement(ctx, gd->xpos, x, 0);
1058
0
  int i;
1059
1060
0
  if (start < 0 || end < 0 || xidx < 0 || start >= end)
1061
0
    return 1;
1062
1063
0
  for (i = start; i < end; i++)
1064
0
    get_cell(gd->cells, xidx, i)->v_line++;
1065
1066
0
  return 0;
1067
0
}
1068
1069
/* Shared internal routine with stext-boxer.c  */
1070
fz_rect fz_collate_small_vector_run(fz_stext_block **blockp);
1071
1072
static void
1073
walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
1074
0
{
1075
0
  for (; block != NULL; block = block->next)
1076
0
  {
1077
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1078
0
    {
1079
0
      if (block->u.s.down)
1080
0
        walk_grid_lines(ctx, gd, block->u.s.down->first_block);
1081
0
      continue;
1082
0
    }
1083
0
    else if (block->type == FZ_STEXT_BLOCK_VECTOR)
1084
0
    {
1085
0
      fz_rect r = block->bbox;
1086
0
      float w = r.x1 - r.x0;
1087
0
      float h = r.y1 - r.y0;
1088
0
      int failed = 0;
1089
0
      if (w > h && h < 1)
1090
0
      {
1091
        /* Thin, wide line */
1092
0
        failed = add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1);
1093
0
      }
1094
0
      else if (w < h && w < 1)
1095
0
      {
1096
        /* Thin, wide line */
1097
0
        failed = add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1);
1098
0
      }
1099
0
      else
1100
0
      {
1101
        /* Rectangle */
1102
0
        int failed2;
1103
0
        failed2 = add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y0);
1104
0
        failed2 |= add_h_line(ctx, gd, r.x0, r.x1, r.y1, r.y1);
1105
0
        failed = add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x0);
1106
0
        failed |= add_v_line(ctx, gd, r.y0, r.y1, r.x1, r.x1);
1107
0
        failed &= failed2;
1108
0
      }
1109
0
      if (failed)
1110
0
      {
1111
        /* Try merging multiple successive vectors to get better
1112
         * results. */
1113
0
        r = fz_collate_small_vector_run(&block);
1114
0
        w = r.x1 - r.x0;
1115
0
        h = r.y1 - r.y0;
1116
0
        if (w > h)
1117
0
        {
1118
#ifdef DEBUG
1119
          if (add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1))
1120
#endif
1121
0
            add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1);
1122
0
        }
1123
0
        else
1124
0
        {
1125
#ifdef DEBUG
1126
          if (add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1))
1127
#endif
1128
0
            add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1);
1129
0
        }
1130
0
      }
1131
0
    }
1132
0
  }
1133
0
}
1134
1135
static void
1136
erase_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
1137
0
{
1138
0
  fz_rect bounds = {
1139
0
    gd->xpos->list[0].pos,
1140
0
    gd->ypos->list[0].pos,
1141
0
    gd->xpos->list[gd->xpos->len-1].pos,
1142
0
    gd->ypos->list[gd->ypos->len-1].pos };
1143
1144
0
  for (; block != NULL; block = block->next)
1145
0
  {
1146
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1147
0
    {
1148
0
      if (block->u.s.down)
1149
0
        erase_grid_lines(ctx, gd, block->u.s.down->first_block);
1150
0
      continue;
1151
0
    }
1152
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
1153
0
    {
1154
0
      fz_stext_line *line;
1155
1156
0
      if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 ||
1157
0
        block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0)
1158
0
        continue;
1159
1160
0
      for (line = block->u.t.first_line; line != NULL; line = line->next)
1161
0
      {
1162
0
        fz_stext_char *ch = line->first_char;
1163
1164
        /* Skip leading spaces */
1165
0
        while (ch != NULL && ch->c == ' ')
1166
0
          ch = ch->next;
1167
1168
0
        for (; ch != NULL; ch = ch->next)
1169
0
        {
1170
0
          fz_rect r;
1171
0
          int x, y, x0, x1, y0, y1;
1172
1173
0
          if (ch->c == 32)
1174
0
          {
1175
            /* Trailing space, skip it. */
1176
0
            if (ch->next == NULL)
1177
0
              break;
1178
0
            if (ch->next->c == 32)
1179
0
            {
1180
              /* Run of spaces. Skip 'em. */
1181
0
              while (ch->next && ch->next->c == 32)
1182
0
                ch = ch->next;
1183
0
              continue;
1184
0
            }
1185
            /* A single space. Accept it. */
1186
0
          }
1187
0
          r = fz_rect_from_quad(ch->quad);
1188
0
          x0 = find_cell_l(gd->xpos, r.x0);
1189
0
          x1 = find_cell_r(gd->xpos, r.x1);
1190
0
          y0 = find_cell_l(gd->ypos, r.y0);
1191
0
          y1 = find_cell_r(gd->ypos, r.y1);
1192
0
          if (x0 < 0 || x1 <0 || y0 < 0 || y1 < 0)
1193
0
            continue;
1194
0
          if (x0 < x1)
1195
0
          {
1196
0
            for (y = y0; y <= y1; y++)
1197
0
              for (x = x0; x < x1; x++)
1198
0
                get_cell(gd->cells, x+1, y)->v_crossed++;
1199
0
          }
1200
0
          if (y0 < y1)
1201
0
          {
1202
0
            for (y = y0; y < y1; y++)
1203
0
              for (x = x0; x <= x1; x++)
1204
0
                get_cell(gd->cells, x, y+1)->h_crossed++;
1205
0
          }
1206
0
          for (y = y0; y <= y1; y++)
1207
0
            for (x = x0; x <= x1; x++)
1208
0
              get_cell(gd->cells, x, y)->full++;
1209
0
        }
1210
0
      }
1211
0
    }
1212
0
  }
1213
0
}
1214
1215
static cells_t *new_cells(fz_context *ctx, int w, int h)
1216
0
{
1217
0
  cells_t *cells = fz_calloc(ctx, 1, sizeof(cells_t) + sizeof(cells->cell[0]) * (w * h - 1));
1218
0
  cells->w = w;
1219
0
  cells->h = h;
1220
1221
0
  return cells;
1222
0
}
1223
1224
#ifdef DEBUG_TABLE_STRUCTURE
1225
static void
1226
asciiart_table(grid_walker_data *gd)
1227
{
1228
  int w = gd->xpos->len;
1229
  int h = gd->ypos->len;
1230
  int x, y;
1231
1232
  for (y = 0; y < h; y++)
1233
  {
1234
    for (x = 0; x < w-1; x++)
1235
    {
1236
      cell_t *cell = get_cell(gd->cells, x, y);
1237
      int line = cell->h_line;
1238
      int erase = cell->h_crossed;
1239
      printf("+");
1240
      if (line && !erase)
1241
      {
1242
        printf("-");
1243
      }
1244
      else if (!line && erase)
1245
      {
1246
        printf("v");
1247
      }
1248
      else if (line && erase)
1249
      {
1250
        printf("*");
1251
      }
1252
      else
1253
      {
1254
        printf(" ");
1255
      }
1256
    }
1257
    printf("+\n");
1258
    if (y == h-1)
1259
      break;
1260
    for (x = 0; x < w; x++)
1261
    {
1262
      cell_t *cell = get_cell(gd->cells, x, y);
1263
      int line = cell->v_line;
1264
      int erase = cell->v_crossed;
1265
      if (line && !erase)
1266
      {
1267
        printf("|");
1268
      }
1269
      else if (!line && erase)
1270
      {
1271
        printf(">");
1272
      }
1273
      else if (line && erase)
1274
      {
1275
        printf("*");
1276
      }
1277
      else
1278
      {
1279
        printf(" ");
1280
      }
1281
      if (x < w-1)
1282
      {
1283
        if (cell->full)
1284
          printf("#");
1285
        else
1286
          printf(" ");
1287
      }
1288
      else
1289
        printf("\n");
1290
    }
1291
  }
1292
}
1293
#endif
1294
1295
static void
1296
recalc_bbox(fz_stext_block *block)
1297
0
{
1298
0
  fz_rect bbox = fz_empty_rect;
1299
0
  fz_stext_line *line;
1300
1301
0
  for (line = block->u.t.first_line; line != NULL; line = line->next)
1302
0
    bbox = fz_union_rect(bbox, line->bbox);
1303
1304
0
  block->bbox = bbox;
1305
0
}
1306
1307
static void
1308
unlink_line_from_block(fz_stext_line *line, fz_stext_block *block)
1309
0
{
1310
0
  fz_stext_line *next_line = line->next;
1311
1312
0
  if (line->prev)
1313
0
  {
1314
0
    assert(line->prev->next == line);
1315
0
    line->prev->next = next_line;
1316
0
  }
1317
0
  else
1318
0
  {
1319
0
    assert(block->u.t.first_line == line);
1320
0
    block->u.t.first_line = next_line;
1321
0
  }
1322
0
  if (next_line)
1323
0
  {
1324
0
    assert(next_line->prev == line);
1325
0
    next_line->prev = line->prev;
1326
0
  }
1327
0
  else
1328
0
  {
1329
0
    assert(block->u.t.last_line == line);
1330
0
    block->u.t.last_line = line->prev;
1331
0
  }
1332
0
}
1333
1334
static void
1335
append_line_to_block(fz_stext_line *line, fz_stext_block *block)
1336
0
{
1337
0
  if (block->u.t.last_line == NULL)
1338
0
  {
1339
0
    assert(block->u.t.first_line == NULL);
1340
0
    block->u.t.first_line = block->u.t.last_line = line;
1341
0
    line->prev = NULL;
1342
0
  }
1343
0
  else
1344
0
  {
1345
0
    assert(block->u.t.last_line->next == NULL);
1346
0
    line->prev = block->u.t.last_line;
1347
0
    block->u.t.last_line->next = line;
1348
0
    block->u.t.last_line = line;
1349
0
  }
1350
0
  line->next = NULL;
1351
0
}
1352
1353
static void
1354
unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last)
1355
0
{
1356
0
  if (block->prev)
1357
0
  {
1358
0
    assert(block->prev->next == block);
1359
0
    block->prev->next = block->next;
1360
0
  }
1361
0
  else
1362
0
  {
1363
0
    assert(*first == block);
1364
0
    *first = block->next;
1365
0
  }
1366
0
  if (block->next)
1367
0
  {
1368
0
    assert(block->next->prev == block);
1369
0
    block->next->prev = block->prev;
1370
0
  }
1371
0
  else
1372
0
  {
1373
0
    assert(*last == block);
1374
0
    *last = block->prev;
1375
0
  }
1376
0
}
1377
1378
#ifndef NDEBUG
1379
static int
1380
verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src)
1381
0
{
1382
0
  fz_stext_block *block;
1383
0
  fz_stext_block **first = src ? &src->first_block : &page->first_block;
1384
0
  fz_stext_block **last = src ? &src->last_block : &page->last_block;
1385
0
  int max = 0;
1386
1387
0
  assert((*first == NULL) == (*last == NULL));
1388
1389
0
  for (block = *first; block != NULL; block = block->next)
1390
0
  {
1391
0
    fz_stext_line *line;
1392
1393
0
    if (block->prev == NULL)
1394
0
      assert(*first == block);
1395
0
    else
1396
0
      assert(block->prev->next == block);
1397
0
    if (block->next == NULL)
1398
0
      assert(*last == block);
1399
0
    else
1400
0
      assert(block->next->prev == block);
1401
1402
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1403
0
    {
1404
0
      if (block->u.s.down)
1405
0
      {
1406
0
        int m = verify_stext(ctx, page, block->u.s.down);
1407
0
        if (m > max)
1408
0
          max = m;
1409
0
      }
1410
0
      continue;
1411
0
    }
1412
0
    if (block->type != FZ_STEXT_BLOCK_TEXT)
1413
0
      continue;
1414
0
    assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL));
1415
0
    for (line = block->u.t.first_line; line != NULL; line = line->next)
1416
0
    {
1417
0
      fz_stext_char *ch;
1418
1419
0
      if (line->next == NULL)
1420
0
        assert(block->u.t.last_line == line);
1421
0
      else
1422
0
        assert(line->next->prev == line);
1423
1424
0
      assert((line->first_char == NULL) == (line->last_char == NULL));
1425
1426
0
      for (ch = line->first_char; ch != NULL; ch = ch->next)
1427
0
        assert(ch->next != NULL || line->last_char == ch);
1428
0
    }
1429
0
  }
1430
1431
0
  return max+1;
1432
0
}
1433
#endif
1434
1435
static fz_rect
1436
move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r)
1437
0
{
1438
0
  fz_stext_block *before = dest ? dest->first_block : page->first_block;
1439
0
  fz_stext_block **sfirst = src ? &src->first_block : &page->first_block;
1440
0
  fz_stext_block **slast = src ? &src->last_block : &page->last_block;
1441
0
  fz_stext_block *block, *next_block;
1442
1443
0
  for (block = *sfirst; block != NULL; block = next_block)
1444
0
  {
1445
0
    fz_rect bbox = fz_intersect_rect(block->bbox, r);
1446
0
    next_block = block->next;
1447
    /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
1448
0
    if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1)
1449
0
      continue; /* Trivially excluded */
1450
0
    if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1)
1451
0
    {
1452
      /* Trivially included */
1453
0
      if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
1454
0
      {
1455
0
        if (block->u.s.down->standard == FZ_STRUCTURE_TD)
1456
0
        {
1457
          /* Don't copy TDs! Just copy the contents */
1458
0
          move_contained_content(ctx, page, dest, block->u.s.down, r);
1459
0
          continue;
1460
0
        }
1461
0
      }
1462
0
      unlink_block(block, sfirst, slast);
1463
0
      insert_block_before(block, before, page, dest);
1464
0
      assert(before == block->next);
1465
0
      continue;
1466
0
    }
1467
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
1468
0
    {
1469
0
      if (block->u.s.down && !all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block))
1470
0
        move_contained_content(ctx, page, dest, block->u.s.down, r);
1471
0
    }
1472
0
    if (block->type == FZ_STEXT_BLOCK_TEXT)
1473
0
    {
1474
      /* Partially included text block */
1475
0
      fz_stext_line *line, *next_line;
1476
0
      fz_stext_block *newblock = NULL;
1477
1478
0
      for (line = block->u.t.first_line; line != NULL; line = next_line)
1479
0
      {
1480
0
        fz_rect lrect = fz_intersect_rect(line->bbox, r);
1481
0
        next_line = line->next;
1482
1483
        /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
1484
0
        if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1)
1485
0
          continue; /* Trivial exclusion */
1486
0
        if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1)
1487
0
        {
1488
          /* Trivial inclusion */
1489
0
          if (newblock == NULL)
1490
0
          {
1491
0
            newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
1492
0
            insert_block_before(newblock, before, page, dest);
1493
0
            assert(before == newblock->next);
1494
0
          }
1495
1496
0
          unlink_line_from_block(line, block);
1497
0
          append_line_to_block(line, newblock);
1498
0
        }
1499
0
        else
1500
0
        {
1501
          /* Need to walk the line and just take parts */
1502
0
          fz_stext_line *newline = NULL;
1503
0
          fz_stext_char *ch, *next_ch, *prev_ch = NULL;
1504
1505
0
          for (ch = line->first_char; ch != NULL; ch = next_ch)
1506
0
          {
1507
0
            fz_rect crect = fz_rect_from_quad(ch->quad);
1508
0
            float x = (crect.x0 + crect.x1)/2;
1509
0
            float y = (crect.y0 + crect.y1)/2;
1510
0
            next_ch = ch->next;
1511
0
            if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y)
1512
0
            {
1513
0
              prev_ch = ch;
1514
0
              continue;
1515
0
            }
1516
            /* Take this char */
1517
0
            if (newline == NULL)
1518
0
            {
1519
0
              newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline));
1520
0
              newline->dir = line->dir;
1521
0
              newline->wmode = line->wmode;
1522
0
              newline->bbox = fz_empty_rect;
1523
0
            }
1524
            /* Unlink char */
1525
0
            if (prev_ch == NULL)
1526
0
              line->first_char = next_ch;
1527
0
            else
1528
0
              prev_ch->next = next_ch;
1529
0
            if (next_ch == NULL)
1530
0
              line->last_char = prev_ch;
1531
            /* Relink char */
1532
0
            ch->next = NULL;
1533
0
            if (newline->last_char == NULL)
1534
0
              newline->first_char = ch;
1535
0
            else
1536
0
              newline->last_char->next = ch;
1537
0
            newline->last_char = ch;
1538
0
            newline->bbox = fz_union_rect(newline->bbox, crect);
1539
0
          }
1540
0
          if (line->first_char == NULL)
1541
0
          {
1542
            /* We've removed all the chars from this line.
1543
             * Better remove the line too. */
1544
0
            if (line->prev)
1545
0
              line->prev->next = next_line;
1546
0
            else
1547
0
              block->u.t.first_line = next_line;
1548
0
            if (next_line)
1549
0
              next_line->prev = line->prev;
1550
0
            else
1551
0
              block->u.t.last_line = line->prev;
1552
0
          }
1553
0
          if (newline)
1554
0
          {
1555
0
            if (newblock == NULL)
1556
0
            {
1557
0
              newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
1558
1559
              /* Add the block onto our target list */
1560
0
              insert_block_before(newblock, before, page, dest);
1561
0
              before = newblock->next;
1562
0
            }
1563
0
            append_line_to_block(newline, newblock);
1564
0
          }
1565
0
        }
1566
0
      }
1567
0
      if (newblock)
1568
0
      {
1569
0
        recalc_bbox(block);
1570
0
        recalc_bbox(newblock);
1571
0
      }
1572
0
      if (block->u.t.first_line == NULL)
1573
0
      {
1574
        /* We've removed all the lines from the block. Should remove that too! */
1575
0
        if (block->prev)
1576
0
        {
1577
0
          assert(block->prev->next == block);
1578
0
          block->prev->next = next_block;
1579
0
        }
1580
0
        else
1581
0
        {
1582
0
          assert(*sfirst == block);
1583
0
          *sfirst = block->next;
1584
0
        }
1585
0
        if (next_block)
1586
0
        {
1587
0
          assert(next_block->prev == block);
1588
0
          next_block->prev = block->prev;
1589
0
        }
1590
0
        else
1591
0
        {
1592
0
          assert(*slast == block);
1593
0
          *slast = block->prev;
1594
0
        }
1595
0
      }
1596
0
    }
1597
0
  }
1598
1599
0
  return r;
1600
0
}
1601
1602
static fz_stext_block *
1603
find_table_insertion_point(fz_context *ctx, fz_rect r, fz_stext_block *block)
1604
0
{
1605
0
  fz_stext_block *after = NULL;
1606
1607
0
  for (; block != NULL; block = block->next)
1608
0
  {
1609
0
    fz_rect s = fz_intersect_rect(r, block->bbox);
1610
1611
0
    if (s.x0 > s.x1 || s.y0 > s.y1)
1612
0
      continue;
1613
0
    after = block;
1614
0
  }
1615
1616
  /* Convert to before */
1617
0
  if (after)
1618
0
    after = after->next;
1619
1620
0
  return after;
1621
0
}
1622
1623
static fz_stext_struct *
1624
transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent)
1625
0
{
1626
0
  int w = gd->xpos->len;
1627
0
  int h = gd->ypos->len;
1628
0
  int x, y;
1629
0
  char *sent_tab = fz_calloc(ctx, 1, w*h);
1630
0
  fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
1631
0
  fz_stext_struct *table, *tr, *td;
1632
0
  fz_stext_block *before;
1633
0
  fz_rect r;
1634
1635
  /* Where should we insert the table in the data? */
1636
0
  r.x0 = gd->xpos->list[0].pos;
1637
0
  r.x1 = gd->xpos->list[w-1].pos;
1638
0
  r.y0 = gd->ypos->list[0].pos;
1639
0
  r.y1 = gd->ypos->list[h-1].pos;
1640
0
  before = find_table_insertion_point(ctx, r, *first_block);
1641
1642
  /* Make table */
1643
0
  table = add_struct_block_before(ctx, before, page, parent, FZ_STRUCTURE_TABLE, "Table");
1644
1645
  /* Run through the cells, and guess at spanning. */
1646
0
  for (y = 0; y < h-1; y++)
1647
0
  {
1648
    /* Have we sent this entire row before? */
1649
0
    for (x = 0; x < w-1; x++)
1650
0
    {
1651
0
      if (!sent_tab[x+y*w])
1652
0
        break;
1653
0
    }
1654
0
    if (x == w-1)
1655
0
      continue; /* No point in sending a row with nothing in it! */
1656
1657
    /* Make TR */
1658
0
    tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR");
1659
1660
0
    for (x = 0; x < w-1; x++)
1661
0
    {
1662
0
      int x2, y2;
1663
0
      int cellw = 1;
1664
0
      int cellh = 1;
1665
1666
      /* Have we sent this cell already? */
1667
0
      if (sent_tab[x+y*w])
1668
0
        continue;
1669
1670
      /* Find the width of the cell */
1671
0
      for (x2 = x+1; x2 < w-1; x2++)
1672
0
      {
1673
0
        cell_t *cell = get_cell(gd->cells, x2, y);
1674
0
        if (cell->v_line)
1675
0
          break; /* Can't go past a line */
1676
0
        if (gd->xpos->list[x2].uncertainty == 0)
1677
0
          break; /* An uncertainty of 0 is as good as a line. */
1678
0
        if (!cell->v_crossed)
1679
0
          break;
1680
0
        cellw++;
1681
0
      }
1682
      /* Find the height of the cell */
1683
0
      for (y2 = y+1; y2 < h-1; y2++)
1684
0
      {
1685
0
        cell_t *cell;
1686
0
        int h_crossed = 0;
1687
0
        if (gd->ypos->list[y2].uncertainty == 0)
1688
0
          break; /* An uncertainty of 0 is as good as a line. */
1689
1690
0
        cell = get_cell(gd->cells, x, y2);
1691
0
        if (cell->h_line)
1692
0
          break; /* Can't extend down through a line. */
1693
0
        if (cell->h_crossed)
1694
0
          h_crossed = 1;
1695
0
        for (x2 = x+1; x2 < x+cellw; x2++)
1696
0
        {
1697
0
          cell_t *cell = get_cell(gd->cells, x2, y2);
1698
0
          if (cell->h_line)
1699
0
            break;
1700
0
          if (cell->v_line)
1701
0
            break; /* Can't go past a line */
1702
0
          if (gd->xpos->list[x2].uncertainty == 0)
1703
0
            break; /* An uncertainty of 0 is as good as a line. */
1704
0
          if (!cell->v_crossed)
1705
0
            break;
1706
0
          if (cell->h_crossed)
1707
0
            h_crossed = 1;
1708
0
        }
1709
0
        if (x2 == x+cellw && h_crossed)
1710
0
          cellh++;
1711
0
        else
1712
0
          break;
1713
0
      }
1714
      /* Make TD */
1715
0
      td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD");
1716
0
      r.x0 = gd->xpos->list[x].pos;
1717
0
      r.x1 = gd->xpos->list[x+cellw].pos;
1718
0
      r.y0 = gd->ypos->list[y].pos;
1719
0
      r.y1 = gd->ypos->list[y+cellh].pos;
1720
      /* Use r, not REAL contents bbox, as otherwise spanned rows
1721
       * can end up empty. */
1722
0
      td->up->bbox = r;
1723
0
      move_contained_content(ctx, page, td, parent, r);
1724
#ifdef DEBUG_TABLE_STRUCTURE
1725
      printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh);
1726
#endif
1727
0
      for (y2 = y; y2 < y+cellh; y2++)
1728
0
        for (x2 = x; x2 < x+cellw; x2++)
1729
0
          sent_tab[x2+y2*w] = 1;
1730
0
    }
1731
0
    r.x0 = gd->xpos->list[0].pos;
1732
0
    r.x1 = gd->xpos->list[gd->xpos->len-1].pos;
1733
0
    r.y0 = gd->ypos->list[y].pos;
1734
0
    r.y1 = gd->ypos->list[y+1].pos;
1735
0
    tr->up->bbox = r;
1736
0
    table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox);
1737
0
  }
1738
0
  fz_free(ctx, sent_tab);
1739
1740
0
  return table;
1741
0
}
1742
1743
static void
1744
merge_column(grid_walker_data *gd, int x)
1745
0
{
1746
0
  int y;
1747
0
  for (y = 0; y < gd->cells->h; y++)
1748
0
  {
1749
0
    cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)];
1750
0
    cell_t *s = &gd->cells->cell[x + y * gd->cells->w];
1751
1752
0
    if (x > 0)
1753
0
      memcpy(d-x, s-x, x * sizeof(*d));
1754
0
    d->full = s[0].full || s[1].full;
1755
0
    d->h_crossed = s[0].h_crossed || s[1].h_crossed;
1756
0
    d->h_line = s[0].h_line; /* == s[1].h_line */
1757
0
    d->v_crossed = s[0].v_crossed;
1758
0
    d->v_line = s[0].v_line;
1759
0
    if (x < gd->cells->w - 2)
1760
0
      memcpy(d+1, s+2, (gd->cells->w - 2 - x) * sizeof(*d));
1761
0
  }
1762
0
  gd->cells->w--;
1763
1764
0
  if (x < gd->xpos->len - 2)
1765
0
    memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], (gd->xpos->len - 2 - x) * sizeof(gd->xpos->list[0]));
1766
0
  gd->xpos->len--;
1767
0
}
1768
1769
static void
1770
merge_columns(grid_walker_data *gd)
1771
0
{
1772
0
  int x, y;
1773
1774
0
  for (x = gd->cells->w-3; x >= 0; x--)
1775
0
  {
1776
    /* Can column x be merged with column x+1? */
1777
    /* We never merge 2 columns where the uncertainty of the division between them is 0. */
1778
0
    if (gd->xpos->list[x+1].uncertainty == 0)
1779
0
      break;
1780
    /* This requires all the pairs of cells in those 2 columns to be mergeable. */
1781
0
    for (y = 0; y < gd->cells->h-1; y++)
1782
0
    {
1783
0
      cell_t *a = get_cell(gd->cells, x, y);
1784
0
      cell_t *b = get_cell(gd->cells, x+1, y);
1785
      /* If there is a divider, we can't merge. */
1786
0
      if (b->v_line)
1787
0
        break;
1788
      /* If either is empty, we can merge. */
1789
0
      if (!a->full || !b->full)
1790
0
        continue;
1791
      /* If we differ in h linedness, we can't merge */
1792
0
      if (!!a->h_line != !!b->h_line)
1793
0
        break;
1794
      /* If both are full, we can only merge if we cross. */
1795
0
      if (a->full && b->full && b->v_crossed)
1796
0
        continue;
1797
      /* Otherwise we can't merge */
1798
0
      break;
1799
0
    }
1800
0
    if (y == gd->cells->h-1)
1801
0
    {
1802
      /* Merge the column! */
1803
#ifdef DEBUG_TABLE_STRUCTURE
1804
      printf("Merging column %d\n", x);
1805
#endif
1806
0
      merge_column(gd, x);
1807
#ifdef DEBUG_TABLE_STRUCTURE
1808
      asciiart_table(gd);
1809
#endif
1810
0
    }
1811
0
  }
1812
0
}
1813
1814
static void
1815
merge_row(grid_walker_data *gd, int y)
1816
0
{
1817
0
  int x;
1818
0
  int w = gd->cells->w;
1819
0
  cell_t *d = &gd->cells->cell[y * w];
1820
0
  for (x = 0; x < gd->cells->w-1; x++)
1821
0
  {
1822
0
    if (d->full == 0)
1823
0
      d->full = d[w].full;
1824
0
    if (d->h_crossed == 0)
1825
0
      d->h_crossed = d[w].h_crossed;
1826
0
    d++;
1827
0
  }
1828
0
  if (y < gd->cells->h - 2)
1829
0
    memcpy(d, d+w, (gd->cells->h - 2 - y) * w * sizeof(*d));
1830
0
  gd->cells->h--;
1831
1832
0
  if (y < gd->ypos->len - 2)
1833
0
    memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], (gd->ypos->len - 2 - y) * sizeof(gd->ypos->list[0]));
1834
0
  gd->ypos->len--;
1835
0
}
1836
1837
static void
1838
merge_rows(grid_walker_data *gd)
1839
0
{
1840
0
  int x, y;
1841
1842
0
  for (y = gd->cells->h-3; y >= 0; y--)
1843
0
  {
1844
    /* Can row y be merged with row y+1? */
1845
    /* We never merge 2 rows where the uncertainty of the division between them is 0. */
1846
0
    if (gd->ypos->list[y+1].uncertainty == 0)
1847
0
      break;
1848
    /* This requires all the pairs of cells in those 2 rows to be mergeable. */
1849
0
    for (x = 0; x < gd->cells->w-1; x++)
1850
0
    {
1851
0
      cell_t *a = get_cell(gd->cells, x, y);
1852
0
      cell_t *b = get_cell(gd->cells, x, y+1);
1853
      /* If there is a divider, we can't merge. */
1854
0
      if (b->h_line)
1855
0
        break;
1856
      /* If either is empty, we can merge. */
1857
0
      if (!a->full || !b->full)
1858
0
        continue;
1859
      /* If we differ in v linedness, we can't merge */
1860
0
      if (!!a->v_line != !!b->v_line)
1861
0
        break;
1862
      /* If both are full, we can only merge if we cross. */
1863
0
      if (a->full && b->full && b->h_crossed)
1864
0
        continue;
1865
      /* Otherwise we can't merge */
1866
0
      break;
1867
0
    }
1868
0
    if (x == gd->cells->w-1)
1869
0
    {
1870
      /* Merge the row! */
1871
#ifdef DEBUG_TABLE_STRUCTURE
1872
      printf("Merging row %d\n", y);
1873
#endif
1874
0
      merge_row(gd, y);
1875
#ifdef DEBUG_TABLE_STRUCTURE
1876
      asciiart_table(gd);
1877
#endif
1878
0
    }
1879
0
  }
1880
0
}
1881
1882
static int
1883
tr_is_empty(fz_context *ctx, fz_stext_block *block)
1884
0
{
1885
0
  for (; block != NULL; block = block->next)
1886
0
  {
1887
0
    if (block->type != FZ_STEXT_BLOCK_STRUCT)
1888
0
      return 0;
1889
0
    if (!block->u.s.down)
1890
0
    {
1891
0
    }
1892
0
    else if (block->u.s.down->standard != FZ_STRUCTURE_TD)
1893
0
      return 0;
1894
0
    else if (block->u.s.down->first_block != NULL)
1895
0
      return 0;
1896
0
  }
1897
1898
0
  return 1;
1899
0
}
1900
1901
static int
1902
table_is_empty(fz_context *ctx, fz_stext_block *block)
1903
0
{
1904
0
  for (; block != NULL; block = block->next)
1905
0
  {
1906
0
    if (block->type == FZ_STEXT_BLOCK_GRID)
1907
0
      continue;
1908
0
    if (block->type != FZ_STEXT_BLOCK_STRUCT)
1909
0
      return 0;
1910
0
    if (!block->u.s.down)
1911
0
    {
1912
0
    }
1913
0
    else if (block->u.s.down->standard != FZ_STRUCTURE_TR)
1914
0
      return 0;
1915
0
    else if (!tr_is_empty(ctx, block->u.s.down->first_block))
1916
0
      return 0;
1917
0
  }
1918
1919
0
  return 1;
1920
0
}
1921
1922
static void
1923
tidy_td_divs(fz_context *ctx, fz_stext_struct *parent)
1924
0
{
1925
0
  while (1)
1926
0
  {
1927
0
    fz_stext_block *block = parent->first_block;
1928
1929
0
    if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV)
1930
0
      return;
1931
1932
0
    parent->first_block = block->u.s.down->first_block;
1933
0
    parent->last_block = block->u.s.down->last_block;
1934
0
  }
1935
0
}
1936
1937
static void
1938
tidy_tr_divs(fz_context *ctx, fz_stext_block *block)
1939
0
{
1940
0
  for (; block != NULL; block = block->next)
1941
0
  {
1942
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD)
1943
0
      tidy_td_divs(ctx, block->u.s.down);
1944
0
  }
1945
0
}
1946
1947
static void
1948
tidy_table_divs(fz_context *ctx, fz_stext_block *block)
1949
0
{
1950
0
  for (; block != NULL; block = block->next)
1951
0
  {
1952
0
    if (block->type == FZ_STEXT_BLOCK_GRID)
1953
0
      continue;
1954
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR)
1955
0
      tidy_tr_divs(ctx, block->u.s.down->first_block);
1956
0
  }
1957
0
}
1958
1959
static int
1960
div_is_empty(fz_context *ctx, fz_stext_block *block)
1961
0
{
1962
0
  for (; block != NULL; block = block->next)
1963
0
  {
1964
0
    if (block->type != FZ_STEXT_BLOCK_STRUCT)
1965
0
      return 0;
1966
0
    if (!block->u.s.down)
1967
0
    {
1968
0
    }
1969
0
    else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
1970
0
    {
1971
0
      tidy_table_divs(ctx, block->u.s.down->first_block);
1972
0
      return table_is_empty(ctx, block->u.s.down->first_block);
1973
0
    }
1974
0
    else if (block->u.s.down->standard != FZ_STRUCTURE_DIV)
1975
0
      return 0;
1976
0
    else if (!div_is_empty(ctx, block->u.s.down->first_block))
1977
0
      return 0;
1978
0
  }
1979
1980
0
  return 1;
1981
0
}
1982
1983
static void
1984
tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
1985
0
{
1986
0
  fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block;
1987
0
  fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block;
1988
0
  fz_stext_block *block, *next_block;
1989
1990
0
  for (block = *first_blockp; block != NULL; block = next_block)
1991
0
  {
1992
0
    next_block = block->next;
1993
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
1994
0
    {
1995
0
      if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
1996
0
      {
1997
0
        tidy_table_divs(ctx, block->u.s.down->first_block);
1998
0
        if (table_is_empty(ctx, block->u.s.down->first_block))
1999
0
        {
2000
          /* Remove block */
2001
0
          if (block->prev)
2002
0
          {
2003
0
            assert(block->prev->next == block);
2004
0
            block->prev->next = next_block;
2005
0
          }
2006
0
          else
2007
0
          {
2008
0
            assert(*first_blockp == block);
2009
0
            *first_blockp = next_block;
2010
0
          }
2011
0
          if (next_block)
2012
0
          {
2013
0
            assert(next_block->prev == block);
2014
0
            next_block->prev = block->prev;
2015
0
          }
2016
0
          else
2017
0
          {
2018
0
            assert(*last_blockp == block);
2019
0
            *last_blockp = block->prev;
2020
0
          }
2021
0
        }
2022
0
        else
2023
0
        {
2024
0
          tidy_orphaned_tables(ctx, page, block->u.s.down);
2025
0
        }
2026
0
      }
2027
0
      if (block->u.s.down->standard == FZ_STRUCTURE_DIV)
2028
0
      {
2029
0
        tidy_orphaned_tables(ctx, page, block->u.s.down);
2030
0
        if (div_is_empty(ctx, block->u.s.down->first_block))
2031
0
        {
2032
          /* Remove block */
2033
0
          if (block->prev)
2034
0
          {
2035
0
            assert(block->prev->next == block);
2036
0
            block->prev->next = next_block;
2037
0
          }
2038
0
          else
2039
0
          {
2040
0
            assert(*first_blockp == block);
2041
0
            *first_blockp = next_block;
2042
0
          }
2043
0
          if (next_block)
2044
0
          {
2045
0
            assert(next_block->prev == block);
2046
0
            next_block->prev = block->prev;
2047
0
          }
2048
0
          else
2049
0
          {
2050
0
            assert(*last_blockp == block);
2051
0
            *last_blockp = block->prev;
2052
0
          }
2053
0
        }
2054
0
      }
2055
0
    }
2056
0
  }
2057
0
}
2058
2059
static fz_stext_struct *
2060
check_for_grid_lines(fz_context *ctx, fz_stext_grid_positions *xps, fz_stext_grid_positions *yps, fz_stext_page *page, fz_stext_struct *parent, int num_subtables)
2061
0
{
2062
0
  fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block;
2063
0
  grid_walker_data gd = { 0 };
2064
0
  fz_stext_struct *table = NULL;
2065
2066
0
  gd.xpos = xps;
2067
0
  gd.ypos = yps;
2068
2069
0
  fz_var(gd);
2070
2071
0
  fz_try(ctx)
2072
0
  {
2073
0
    gd.cells = new_cells(ctx, xps->len, yps->len);
2074
2075
    /* First we walk the content looking for grid lines. These
2076
     * lines refine our positions. */
2077
0
    walk_grid_lines(ctx, &gd, *first_blockp);
2078
    /* Now, we walk the content looking for content that crosses
2079
     * these grid lines. This allows us to spot spanned cells. */
2080
0
    erase_grid_lines(ctx, &gd, *first_blockp);
2081
2082
#ifdef DEBUG_TABLE_STRUCTURE
2083
    asciiart_table(&gd);
2084
#endif
2085
    /* Now, can we remove some columns or rows? i.e. have we oversegmented? */
2086
0
    merge_columns(&gd);
2087
0
    merge_rows(&gd);
2088
2089
    /* Did we shrink the table so much it's not a table any more? */
2090
0
    if (gd.xpos->len < 3 || gd.ypos->len < 3)
2091
0
      break;
2092
2093
0
    if (num_subtables > 0)
2094
0
    {
2095
      /* We are risking throwing away a table we've already found for this
2096
       * one. Only do it if this one is really convincing. */
2097
0
      int x, y;
2098
0
      for (x = 0; x < gd.xpos->len; x++)
2099
0
        if (gd.xpos->list[x].uncertainty != 0)
2100
0
          break;
2101
0
      if (x != gd.xpos->len)
2102
0
        break;
2103
0
      for (y = 0; y < gd.ypos->len; y++)
2104
0
        if (gd.ypos->list[y].uncertainty != 0)
2105
0
          break;
2106
0
      if (y != gd.ypos->len)
2107
0
        break;
2108
0
    }
2109
2110
#ifdef DEBUG_TABLE_STRUCTURE
2111
    printf("Transcribing table: (%g,%g)->(%g,%g)\n",
2112
      gd.xpos->list[0].pos,
2113
      gd.ypos->list[0].pos,
2114
      gd.xpos->list[gd.xpos->len-1].pos,
2115
      gd.ypos->list[gd.ypos->len-1].pos);
2116
#endif
2117
2118
    /* Now we should have the entire table calculated. */
2119
0
    table = transcribe_table(ctx, &gd, page, parent);
2120
2121
0
    tidy_orphaned_tables(ctx, page, parent);
2122
0
  }
2123
0
  fz_always(ctx)
2124
0
  {
2125
0
    fz_free(ctx, gd.cells);
2126
0
  }
2127
0
  fz_catch(ctx)
2128
0
    fz_rethrow(ctx);
2129
2130
0
  return table;
2131
0
}
2132
2133
static fz_rect
2134
bbox_of_blocks(fz_stext_block *block)
2135
0
{
2136
0
  fz_rect r = fz_empty_rect;
2137
2138
0
  while (block)
2139
0
  {
2140
0
    r = fz_union_rect(r, block->bbox);
2141
0
    block = block->next;
2142
0
  }
2143
2144
0
  return r;
2145
0
}
2146
2147
static int
2148
do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
2149
0
{
2150
0
  div_list xs = { 0 };
2151
0
  div_list ys = { 0 };
2152
0
  fz_stext_block *block;
2153
0
  int count;
2154
0
  fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
2155
0
  fz_stext_grid_positions *xps = NULL;
2156
0
  fz_stext_grid_positions *yps = NULL;
2157
0
  int num_subtables = 0;
2158
2159
  /* No content? Just bale. */
2160
0
  if (*first_block == NULL)
2161
0
    return 0;
2162
2163
  /* First off, descend into any children to see if those look like tables. */
2164
0
  count = 0;
2165
0
  for (block = *first_block; block != NULL; block = block->next)
2166
0
  {
2167
0
    if (block->type == FZ_STEXT_BLOCK_STRUCT)
2168
0
    {
2169
0
      if (block->u.s.down)
2170
0
      {
2171
0
        num_subtables += do_table_hunt(ctx, page, block->u.s.down);
2172
0
        count++;
2173
0
      }
2174
0
    }
2175
0
    else if (block->type == FZ_STEXT_BLOCK_TEXT)
2176
0
      count++;
2177
0
  }
2178
2179
  /* If we don't have at least a single child, no more to hunt. */
2180
  /* We only need a single block, because a single text block can
2181
   * contain an entire unbordered table. */
2182
0
  if (count < 1)
2183
0
    return num_subtables;
2184
2185
  /* We only look for a table at this level if we either at the top
2186
   * or on a div. This saves us looking for tables within an 'H'
2187
   * for example. */
2188
0
  if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV)
2189
0
    return num_subtables;
2190
2191
  /* If all the content here looks like a column of text, don't
2192
   * hunt for a table within it. */
2193
0
  if (all_blocks_are_justified_or_headers(ctx, *first_block))
2194
0
    return num_subtables;
2195
2196
0
  fz_var(xps);
2197
0
  fz_var(yps);
2198
2199
0
  fz_try(ctx)
2200
0
  {
2201
    /* Now see whether the content looks like tables.
2202
     * Currently, we pass descend == 1, which means we consider all the
2203
     * content at this level, plus any children. This allows us to cope
2204
     * with having oversegmented. */
2205
0
    walk_blocks(ctx, &xs, &ys, *first_block, 1);
2206
2207
0
    sanitize_positions(ctx, &xs);
2208
0
    sanitize_positions(ctx, &ys);
2209
2210
    /* Run across the line, counting 'winding' */
2211
0
    if (xs.len > 2 && ys.len > 2)
2212
0
    {
2213
0
      fz_stext_struct *table;
2214
0
      fz_rect rect = bbox_of_blocks(*first_block);
2215
0
      xps = make_table_positions(ctx, &xs, rect.x0, rect.x1);
2216
0
      yps = make_table_positions(ctx, &ys, rect.y0, rect.y1);
2217
0
      table = check_for_grid_lines(ctx, xps, yps, page, parent, num_subtables);
2218
2219
0
      if (table != NULL)
2220
0
      {
2221
0
        fz_stext_block *block;
2222
0
        fz_stext_grid_positions *xps2 = clone_grid_positions(ctx, page, xps);
2223
0
        fz_stext_grid_positions *yps2 = clone_grid_positions(ctx, page, yps);
2224
0
        block = add_grid_block(ctx, page, &table->first_block, &table->last_block);
2225
0
        block->u.b.xs = xps2;
2226
0
        block->u.b.ys = yps2;
2227
0
        block->bbox.x0 = block->u.b.xs->list[0].pos;
2228
0
        block->bbox.y0 = block->u.b.ys->list[0].pos;
2229
0
        block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos;
2230
0
        block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos;
2231
0
        num_subtables = 1;
2232
0
      }
2233
#ifdef DEBUG_WRITE_AS_PS
2234
      {
2235
        int i;
2236
        printf("%% TABLE\n");
2237
        for (i = 0; i < block->u.b.xs->len; i++)
2238
        {
2239
          if (block->u.b.xs->list[i].uncertainty)
2240
            printf("0 1 0 setrgbcolor\n");
2241
          else
2242
            printf("0 0.5 0 setrgbcolor\n");
2243
          printf("%g %g moveto %g %g lineto stroke\n",
2244
            block->u.b.xs->list[i].pos, block->bbox.y0,
2245
            block->u.b.xs->list[i].pos, block->bbox.y1);
2246
        }
2247
        for (i = 0; i < block->u.b.ys->len; i++)
2248
        {
2249
          if (block->u.b.ys->list[i].uncertainty)
2250
            printf("0 1 0 setrgbcolor\n");
2251
          else
2252
            printf("0 0.5 0 setrgbcolor\n");
2253
          printf("%g %g moveto %g %g lineto stroke\n",
2254
            block->bbox.x0, block->u.b.ys->list[i].pos,
2255
            block->bbox.x1, block->u.b.ys->list[i].pos);
2256
        }
2257
      }
2258
#endif
2259
0
    }
2260
0
  }
2261
0
  fz_always(ctx)
2262
0
  {
2263
0
    fz_free(ctx, xs.list);
2264
0
    fz_free(ctx, ys.list);
2265
0
    fz_free(ctx, xps);
2266
0
    fz_free(ctx, yps);
2267
0
  }
2268
0
  fz_catch(ctx)
2269
0
    fz_rethrow(ctx);
2270
2271
0
  return num_subtables;
2272
0
}
2273
2274
void
2275
fz_table_hunt(fz_context *ctx, fz_stext_page *page)
2276
0
{
2277
0
  if (page == NULL)
2278
0
    return;
2279
2280
0
  assert(verify_stext(ctx, page, NULL));
2281
2282
0
  do_table_hunt(ctx, page, NULL);
2283
2284
0
  assert(verify_stext(ctx, page, NULL));
2285
0
}