/src/mupdf/source/fitz/stext-table.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2025 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #include <assert.h> |
26 | | |
27 | | /* #define DEBUG_WRITE_AS_PS */ |
28 | | |
29 | | /* #define DEBUG_TABLE_STRUCTURE */ |
30 | | |
31 | | /* #define DEBUG_TABLE_HUNT */ |
32 | | |
33 | | /* |
34 | | * The algorithm. |
35 | | * |
36 | | * The goal of the algorithm is to identify tables on a page. |
37 | | * First we have to find the tables on a page, then we have to |
38 | | * figure out where the columns/rows are, and then how the |
39 | | * cells span them. |
40 | | * |
41 | | * We do this as a series of steps. |
42 | | * |
43 | | * To illustrate what's going on, let's use an example page |
44 | | * that we can follow through all the steps. |
45 | | * |
46 | | * +---------------------------+ |
47 | | * | | |
48 | | * | #### ## ### ## | <- Title |
49 | | * | | |
50 | | * | ##### ##### #### ## | \ |
51 | | * | ## ###### ###### ## | | |
52 | | * | #### ####### ###### | |- Abstract |
53 | | * | ####### #### ## ### | | |
54 | | * | ### ##### ###### | / |
55 | | * | | |
56 | | * | ######### ######### | 2 Columns of text |
57 | | * | ######### ######### | |
58 | | * | ######## ######### | |
59 | | * | ######### | |
60 | | * | +-------+ ####### | <- With an image on the left |
61 | | * | | | | |
62 | | * | | | ## ## # # | <- And a table on the right |
63 | | * | +-------+ ## ## # # | |
64 | | * | ## ## # # | |
65 | | * | ######### ## ## # # | |
66 | | * | ######### ## ## # # | |
67 | | * | ######### ## ## # # | |
68 | | * | | |
69 | | * +---------------------------+ |
70 | | * |
71 | | * |
72 | | * Step 1: Segmentation. |
73 | | * |
74 | | * First, we segment the page, trying to break it down into a |
75 | | * series of non-overlapping rectangles. We do this (in stext-boxer.c) |
76 | | * by looking for where the content isn't. If we can identify breaks |
77 | | * that run through the page (either from top to bottom or from left |
78 | | * to right), then we can split the page there, and recursively consider |
79 | | * the two halves of the page. |
80 | | * |
81 | | * It's not a perfect algorithm, but it manages to in many cases. |
82 | | * |
83 | | * After segmenting the above example, first we'll find the horizontal |
84 | | * splits, giving: |
85 | | * |
86 | | * +---------------------------+ |
87 | | * | | |
88 | | * | #### ## ### ## | |
89 | | * +---------------------------+ |
90 | | * | ##### ##### #### ## | |
91 | | * | ## ###### ###### ## | |
92 | | * | #### ####### ###### | |
93 | | * | ####### #### ## ### | |
94 | | * | ### ##### ###### | |
95 | | * +---------------------------+ |
96 | | * | ######### ######### | |
97 | | * | ######### ######### | |
98 | | * | ######## ######### | |
99 | | * | ######### | |
100 | | * | +-------+ ####### | |
101 | | * | | | | |
102 | | * | | | ## ## # # | |
103 | | * | +-------+ ## ## # # | |
104 | | * | ## ## # # | |
105 | | * | ######### ## ## # # | |
106 | | * | ######### ## ## # # | |
107 | | * | ######### ## ## # # | |
108 | | * | | |
109 | | * +---------------------------+ |
110 | | * |
111 | | * Then we'll recurse and find the vertical split between |
112 | | * the columns: |
113 | | * |
114 | | * +---------------------------+ |
115 | | * | | |
116 | | * | #### ## ### ## | |
117 | | * +---------------------------+ |
118 | | * | ##### ##### #### ## | |
119 | | * | ## ###### ###### ## | |
120 | | * | #### ####### ###### | |
121 | | * | ####### #### ## ### | |
122 | | * | ### ##### ###### | |
123 | | * +-------------+-------------+ |
124 | | * | ######### | ######### | |
125 | | * | ######### | ######### | |
126 | | * | ######## | ######### | |
127 | | * | | ######### | |
128 | | * | +-------+ | ####### | |
129 | | * | | | | | |
130 | | * | | | | ## ## # # | |
131 | | * | +-------+ | ## ## # # | |
132 | | * | | ## ## # # | |
133 | | * | ######### | ## ## # # | |
134 | | * | ######### | ## ## # # | |
135 | | * | ######### | ## ## # # | |
136 | | * | | | |
137 | | * +-------------+-------------+ |
138 | | * |
139 | | * Then we recurse again and find the horizontal splits |
140 | | * within the columns: |
141 | | * |
142 | | * +---------------------------+ |
143 | | * | | |
144 | | * | #### ## ### ## | |
145 | | * +---------------------------+ |
146 | | * | ##### ##### #### ## | |
147 | | * | ## ###### ###### ## | |
148 | | * | #### ####### ###### | |
149 | | * | ####### #### ## ### | |
150 | | * | ### ##### ###### | |
151 | | * +-------------+-------------+ |
152 | | * | ######### | ######### | |
153 | | * | ######### | ######### | |
154 | | * | ######## | ######### | |
155 | | * +-------------+ ######### | |
156 | | * | +-------+ | ####### | |
157 | | * | | | +-------------+ |
158 | | * | | | | ## ## # # | |
159 | | * | +-------+ | ## ## # # | |
160 | | * +-------------+ ## ## # # | |
161 | | * | ######### | ## ## # # | |
162 | | * | ######### | ## ## # # | |
163 | | * | ######### | ## ## # # | |
164 | | * | | | |
165 | | * +-------------+-------------+ |
166 | | * |
167 | | * We recurse a fixed maximum number of times (currently |
168 | | * 6, IIRC) or until we fail to find any suitable splits. |
169 | | * |
170 | | * This completes the page segmentation step. |
171 | | * |
172 | | * Step 2: Grid finding |
173 | | * |
174 | | * Next, we look at each of those segments and try to identify |
175 | | * where grids might be. |
176 | | * |
177 | | * Imagine the bottom right section of that page as |
178 | | * a board with lego blocks on where there is text. |
179 | | * Now imagine viewing that from the bottom of the page. |
180 | | * The gaps between the columns of the table are where you |
181 | | * can see through to the top between the blocks. |
182 | | * |
183 | | * Similarly, if you view it from the side, the gaps between the |
184 | | * rows of the page are where you can see through to the other |
185 | | * side. |
186 | | * |
187 | | * So, how do we code that? Well, we run through the page content |
188 | | * (obviously, restricted to the content that falls into this |
189 | | * segment of the page - that'll go without saying from here on |
190 | | * in). For each bit of content, we look at the "x extent" of that |
191 | | * content - for instance a given string might start at position |
192 | | * 10 and continue to position 100. We build a list of all these |
193 | | * start, and stop positions, and keep them in a sorted list. |
194 | | * |
195 | | * Then we walk this list from left to right, keeping a sum. I |
196 | | * call this sum "wind", because it's very similar to the winding |
197 | | * number that you get when doing scan conversion of bezier shapes. |
198 | | * |
199 | | * wind starts out as 0. We increment it whenever we pass a 'start' |
200 | | * position, and decrement it whenever we pass a 'stop' position. |
201 | | * So at any given x position along the line wind tells us the |
202 | | * number of pieces of content that overlap that x position. |
203 | | * So wind(left) = 0 = wind(right), and wind(x) >= x for all x. |
204 | | * |
205 | | * So, if we walk from left to right, the trace of wind might |
206 | | * look something like: |
207 | | * |
208 | | * __ |
209 | | * ___ / \ _ __ |
210 | | * / \ / \/ \ _/ \_ |
211 | | * / \___/ \___/ \ |
212 | | * |
213 | | * The left and right edges of the table are pretty clear. |
214 | | * The regions where wind drops to 0 represent the column dividers. |
215 | | * The left and right hand side of those regions gives us the min |
216 | | * and max values for that divider. |
217 | | * |
218 | | * We can then repeat this process for Y ranges instead of X ranges |
219 | | * to get the row dividers. |
220 | | * |
221 | | * BUT, this only works for pure grid tables. It falls down for |
222 | | * cases where we have merged cells (which is very common due to |
223 | | * titles etc). |
224 | | * |
225 | | * We can modify the algorithm slightly to allow for this. |
226 | | * |
227 | | * Consider the following table: |
228 | | * |
229 | | * +-----------------------------------+ |
230 | | * | Long Table title across the top | |
231 | | * +---------------+---------+---------+ |
232 | | * | Name | Result1 | Result2 | |
233 | | * +---------------+----+----+----+----+ |
234 | | * | Homer Simpson | 1 | 23 | 4 | 56 | |
235 | | * | Barney Gumble | 1 | 23 | 4 | 56 | |
236 | | * | Moe | 1 | 23 | 4 | 56 | |
237 | | * | Apu | 1 | 23 | 4 | 56 | |
238 | | * | Ned Flanders | 1 | 23 | 4 | 56 | |
239 | | * +---------------+----+----+----+----+ |
240 | | * |
241 | | * The wind trace for that looks something like |
242 | | * (with a certain degree of artistic license for the |
243 | | * limitations of ascii art): |
244 | | * |
245 | | * ________ |
246 | | * / \ _ __ _ _ |
247 | | * / \____/ \_/ \___/ \_/ \ |
248 | | * / \ |
249 | | * |
250 | | * So, the trace never quite drops back to zero in the |
251 | | * middle due to the spanning of the top title. |
252 | | * |
253 | | * So, instead of just looking for points where the trace |
254 | | * drops to zero, we instead look for local minima. Each local |
255 | | * minima represents a place where there might be a grid dividier. |
256 | | * The value of wind at such points can be considered the |
257 | | * "uncertainty" with which there might be a divider there. |
258 | | * Clear dividers (with a wind value of 0) have no uncertainty. |
259 | | * Places where cells are spanned have a higher value of uncertainty. |
260 | | * |
261 | | * The output from this step is the list of possible grid positions |
262 | | * (X and Y), with uncertainty values. |
263 | | * |
264 | | * Step 3: Cell analysis |
265 | | * |
266 | | * So, armed with the output from step 2, we can examine each grid |
267 | | * found. If we have W x-dividers and H y-dividers, we know we have |
268 | | * a potential table with (W-1) x (H-1) cells in it. |
269 | | * |
270 | | * We represent this as a W x H grid of cells, each like: |
271 | | * |
272 | | * . . |
273 | | * . . |
274 | | * . . .+-------+. . . Each cell holds information about the |
275 | | * | . edges above, and to the left of it. |
276 | | * | . |
277 | | * | . |
278 | | * . . .+. . . .+. . . |
279 | | * . . |
280 | | * . . |
281 | | * |
282 | | * h_line: Is there a horizontal divider drawn on the page that |
283 | | * corresponds to the top of this cell (i.e. is there a cell border |
284 | | * here?) |
285 | | * v_line: Is there a vertical divider drawn on the page that |
286 | | * corresponds to the left of this cell (i.e. is there a cell border |
287 | | * here?) |
288 | | * h_crossed: Does content cross this line (i.e. are we merged |
289 | | * with the cell above?) |
290 | | * v_crossed: Does content cross this line (i.e. are we merged |
291 | | * with the cell to the left?) |
292 | | * full: Is there any content in this cell at all? |
293 | | * |
294 | | * We need a W x H grid of cells to represent the entire table due |
295 | | * to the potential right and bottom edge lines. The right and |
296 | | * bottom rows of cells should never be full, or be crossed, but |
297 | | * it's easiest just to use a simple representation that copes with |
298 | | * the h_line and v_line values naturally. |
299 | | * |
300 | | * So, we start with the cells structure empty, and we run through |
301 | | * the page content, filling in the details as we go. |
302 | | * |
303 | | * At the end of the process, we have enough information to draw |
304 | | * an asciiart representation of our table. It might look something |
305 | | * like this (this comes from dotted-gridlines-tables.pdf): |
306 | | * |
307 | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+ |
308 | | * | | | | | |#|#|#|#| | | | |
309 | | * + + + + + + +v+ +v+v+ + + + |
310 | | * | | | | | |#|#|#|#| | | | |
311 | | * + + + + + + + +v+ + + + + + |
312 | | * | |#| | |#|#|#|#|#|#|#|#| |
313 | | * + +v+ + + +v+v+ +v+v+v+v+v+ |
314 | | * | |#| |#|#|#|#|#|#|#|#|#| |
315 | | * + + + + +v+ + +v+ + + + + + |
316 | | * |#|#| #|#|#|#|#|#|#|#|#|#| |
317 | | * +v+v+ +v+ +v+v+ +v+v+v+v+v+ |
318 | | * |#|#| #|#|#|#|#|#|#|#|#|#| |
319 | | * + + + + +v+ + +v+ + + + + + |
320 | | * | |#| |#|#|#|#|#|#|#|#|#| |
321 | | * + +v+ + + +v+v+ +v+v+v+v+v+ |
322 | | * | |#| | |#|#|#|#|#|#|#|#| |
323 | | * + + + + + + + +v+ + + + + + |
324 | | * | | | | | |#|#|#|#| | | | |
325 | | * + + + + + + +v+ +v+v+ + + + |
326 | | * | | | | | |#|#|#|#| | | | |
327 | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+ |
328 | | * | |# |#| | |#| | |#|#|#| |
329 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
330 | | * | |# |#| | |#| | |#|#|#| |
331 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
332 | | * | |#>#|#| | |#| | |#|#|#| |
333 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
334 | | * | |#>#|#| | |#| | |#|#|#| |
335 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
336 | | * | |#>#|#| |#| | | |#|#|#| |
337 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
338 | | * | |# |#| | |#| | |#|#|#| |
339 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
340 | | * | |# |#| | |#| | |#|#|#| |
341 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
342 | | * | |# |#| | |#| | |#|#|#| |
343 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
344 | | * | |# |#| | |#| | |#|#|#| |
345 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
346 | | * |# |#|#|#|#|#|#|#|#|#| |
347 | | * + +-+-+-+-+-+-+-+-+-+-+-+-+ |
348 | | * |
349 | | * This shows where lines are detected ( - and | ), |
350 | | * where they are crossed ( > and v) and where cells |
351 | | * are full ( # ). |
352 | | * |
353 | | * Step 4: Row and column merging. |
354 | | * |
355 | | * Based on the information above, we then try to merge |
356 | | * cells and columns to simplify the table. |
357 | | * |
358 | | * The best rules I've come up with this so far are: |
359 | | * We can merge two adjacent columns if all the pairs of |
360 | | * cells in the two columns are mergeable. |
361 | | * |
362 | | * Cells are held to be mergable or not based upon the following |
363 | | * rules: |
364 | | * If there is a line between 2 cells - not mergeable. |
365 | | * else if the uncertainty between 2 cells is 0 - not mergeable. |
366 | | * else if the line between the 2 cells is crossed - mergeable. |
367 | | * else if strictly one of the cells is full - mergeable. |
368 | | * else not mergeable. |
369 | | * |
370 | | * So in the above example, column 2 (numbered from 0) can be merged |
371 | | * with column 3. |
372 | | * |
373 | | * This gives: |
374 | | * |
375 | | * +-+-+-+-+-+-+-+-+-+-+-+-+ |
376 | | * | | | | | |#|#|#|#| | | | |
377 | | * + + + + + +v+ +v+v+ + + + |
378 | | * | | | | | |#|#|#|#| | | | |
379 | | * + + + + + + +v+ + + + + + |
380 | | * | |#| | |#|#|#|#|#|#|#|#| |
381 | | * + +v+ + +v+v+ +v+v+v+v+v+ |
382 | | * | |#| |#|#|#|#|#|#|#|#|#| |
383 | | * + + + +v+ + +v+ + + + + + |
384 | | * |#|#|#|#|#|#|#|#|#|#|#|#| |
385 | | * +v+v+v+ +v+v+ +v+v+v+v+v+ |
386 | | * |#|#|#|#|#|#|#|#|#|#|#|#| |
387 | | * + + + +v+ + +v+ + + + + + |
388 | | * | |#| |#|#|#|#|#|#|#|#|#| |
389 | | * + +v+ + +v+v+ +v+v+v+v+v+ |
390 | | * | |#| | |#|#|#|#|#|#|#|#| |
391 | | * + + + + + + +v+ + + + + + |
392 | | * | | | | | |#|#|#|#| | | | |
393 | | * + + + + + +v+ +v+v+ + + + |
394 | | * | | | | | |#|#|#|#| | | | |
395 | | * +-+-+-+-+-+-+-+-+-+-+-+-+ |
396 | | * | |#|#| | |#| | |#|#|#| |
397 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
398 | | * | |#|#| | |#| | |#|#|#| |
399 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
400 | | * | |#|#| | |#| | |#|#|#| |
401 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
402 | | * | |#|#| | |#| | |#|#|#| |
403 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
404 | | * | |#|#| |#| | | |#|#|#| |
405 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
406 | | * | |#|#| | |#| | |#|#|#| |
407 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
408 | | * | |#|#| | |#| | |#|#|#| |
409 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
410 | | * | |#|#| | |#| | |#|#|#| |
411 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
412 | | * | |#|#| | |#| | |#|#|#| |
413 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
414 | | * |# |#|#|#|#|#|#|#|#|#| |
415 | | * + +-+-+-+-+-+-+-+-+-+-+-+ |
416 | | * |
417 | | * We then perform the same merging process for rows as for |
418 | | * columns - though there are no rows in the above example |
419 | | * that can be merged. |
420 | | * |
421 | | * You'll note that, for example, we don't merge row 0 and |
422 | | * row 1 in the above, because we have a pair of cells that |
423 | | * are both full without crossing. |
424 | | * |
425 | | * Step 5: Cell spanning |
426 | | * |
427 | | * Now we actually start to output the table. We keep a 'sent_table' |
428 | | * (a grid of W x H bools) to keep track of whether we've output |
429 | | * the content for a given cell or not yet. |
430 | | * |
431 | | * For each cell we reach, assuming sent_table[x,y] is false, |
432 | | * we merge it with as many cells on the right as required, |
433 | | * according to 'v_crossed' values (subject to not passing |
434 | | * v_lines or uncertainty == 0's). |
435 | | * |
436 | | * We then try to merge cells below according to 'h_crossed' |
437 | | * values (subject to not passing h_lines or uncertainty == 0's). |
438 | | * |
439 | | * In theory this can leave us with some cases where we'd like |
440 | | * to merge some cells (because of crossed) and can't (because |
441 | | * of lines or sent_table[]) values. In the absence of better |
442 | | * cell spanning algorithms we have no choice here. |
443 | | * |
444 | | * Then we output the contents and set sent_table[] values as |
445 | | * appropriate. |
446 | | * |
447 | | * If a row has no cells in it, we currently omit the TR. If/when |
448 | | * we figure out how to indicate rowspan/colspan in stext, we can |
449 | | * revisit that. |
450 | | */ |
451 | | |
452 | | |
453 | | static fz_stext_block * |
454 | | add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block) |
455 | 0 | { |
456 | 0 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block)); |
457 | 0 | memset(block, 0, sizeof(*block)); |
458 | 0 | block->type = FZ_STEXT_BLOCK_GRID; |
459 | 0 | block->bbox = fz_empty_rect; /* Fixes bug 703267. */ |
460 | 0 | block->next = *first_block; |
461 | 0 | if (*first_block) |
462 | 0 | { |
463 | 0 | (*first_block)->prev = block; |
464 | 0 | assert(*last_block); |
465 | 0 | } |
466 | 0 | else |
467 | 0 | { |
468 | 0 | assert(*last_block == NULL); |
469 | 0 | *last_block = block; |
470 | 0 | } |
471 | 0 | *first_block = block; |
472 | 0 | return block; |
473 | 0 | } |
474 | | |
475 | | static void |
476 | | insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest) |
477 | 0 | { |
478 | 0 | if (before) |
479 | 0 | { |
480 | | /* We have a block to insert it before, so we know it's not the last. */ |
481 | 0 | assert(dest->first_block != NULL && dest->last_block != NULL); |
482 | 0 | block->next = before; |
483 | 0 | block->prev = before->prev; |
484 | 0 | if (before->prev) |
485 | 0 | { |
486 | 0 | assert(before->prev->next == before); |
487 | 0 | before->prev->next = block; |
488 | 0 | } |
489 | 0 | else if (dest) |
490 | 0 | { |
491 | 0 | assert(dest->first_block == before); |
492 | 0 | dest->first_block = block; |
493 | 0 | } |
494 | 0 | else |
495 | 0 | { |
496 | 0 | assert(page->first_block == before); |
497 | 0 | page->first_block = block; |
498 | 0 | } |
499 | 0 | before->prev = block; |
500 | 0 | } |
501 | 0 | else if (dest) |
502 | 0 | { |
503 | | /* Will be the last block. */ |
504 | 0 | block->next = NULL; |
505 | 0 | block->prev = dest->last_block; |
506 | 0 | if (dest->last_block) |
507 | 0 | { |
508 | 0 | assert(dest->last_block->next == NULL); |
509 | 0 | dest->last_block->next = block; |
510 | 0 | } |
511 | 0 | if (dest->first_block == NULL) |
512 | 0 | dest->first_block = block; |
513 | 0 | dest->last_block = block; |
514 | 0 | } |
515 | 0 | else |
516 | 0 | { |
517 | | /* Will be the last block. */ |
518 | 0 | block->next = NULL; |
519 | 0 | block->prev = page->last_block; |
520 | 0 | if (page->last_block) |
521 | 0 | { |
522 | 0 | assert(page->last_block->next == NULL); |
523 | 0 | page->last_block->next = block; |
524 | 0 | } |
525 | 0 | if (page->first_block == NULL) |
526 | 0 | page->first_block = block; |
527 | 0 | page->last_block = block; |
528 | 0 | } |
529 | 0 | } |
530 | | |
531 | | static fz_stext_struct * |
532 | | add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw) |
533 | 0 | { |
534 | 0 | fz_stext_block *block; |
535 | 0 | int idx = 0; |
536 | 0 | size_t z; |
537 | 0 | fz_stext_struct *newstruct; |
538 | |
|
539 | 0 | if (raw == NULL) |
540 | 0 | raw = ""; |
541 | 0 | z = strlen(raw); |
542 | | |
543 | | /* We're going to insert a struct block. We need an idx, so walk the list */ |
544 | 0 | for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next) |
545 | 0 | { |
546 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
547 | 0 | { |
548 | 0 | assert(block->u.s.index >= idx); |
549 | 0 | idx = block->u.s.index + 1; |
550 | 0 | } |
551 | 0 | } |
552 | | /* So we'll add our block as idx. But all the other struct blocks that follow us need to have |
553 | | * larger values. */ |
554 | | |
555 | | /* Update all the subsequent structs to have a higher idx */ |
556 | 0 | if (before) |
557 | 0 | { |
558 | 0 | int idx2 = idx+1; |
559 | 0 | for (block = before->next; block != NULL; block = block->next) |
560 | 0 | { |
561 | 0 | if (block->type != FZ_STEXT_BLOCK_STRUCT) |
562 | 0 | continue; |
563 | 0 | if (block->u.s.index > idx2) |
564 | 0 | break; |
565 | 0 | block->u.s.index = idx2++; |
566 | 0 | } |
567 | 0 | } |
568 | | |
569 | | /* Now make our new struct block and insert it. */ |
570 | 0 | block = fz_pool_alloc(ctx, page->pool, sizeof(*block)); |
571 | 0 | block->type = FZ_STEXT_BLOCK_STRUCT; |
572 | 0 | block->bbox = fz_empty_rect; /* Fixes bug 703267. */ |
573 | 0 | insert_block_before(block, before, page, parent); |
574 | |
|
575 | 0 | block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z); |
576 | 0 | block->u.s.index = idx; |
577 | 0 | newstruct->parent = parent; |
578 | 0 | newstruct->standard = std; |
579 | 0 | memcpy(newstruct->raw, raw, z); |
580 | 0 | newstruct->raw[z] = 0; |
581 | 0 | newstruct->up = block; |
582 | |
|
583 | 0 | return newstruct; |
584 | 0 | } |
585 | | |
586 | | typedef struct |
587 | | { |
588 | | int len; |
589 | | int max; |
590 | | struct { |
591 | | int left; |
592 | | float pos; |
593 | | int freq; |
594 | | } *list; |
595 | | } div_list; |
596 | | |
597 | | static void |
598 | | div_list_push(fz_context *ctx, div_list *div, int left, float pos) |
599 | 0 | { |
600 | 0 | int i; |
601 | | |
602 | | /* FIXME: Could be bsearch. */ |
603 | 0 | for (i = 0; i < div->len; i++) |
604 | 0 | { |
605 | 0 | if (div->list[i].pos > pos) |
606 | 0 | break; |
607 | 0 | else if (div->list[i].pos == pos && div->list[i].left == left) |
608 | 0 | { |
609 | 0 | div->list[i].freq++; |
610 | 0 | return; |
611 | 0 | } |
612 | 0 | } |
613 | | |
614 | 0 | if (div->len == div->max) |
615 | 0 | { |
616 | 0 | int newmax = div->max * 2; |
617 | 0 | if (newmax == 0) |
618 | 0 | newmax = 32; |
619 | 0 | div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax); |
620 | 0 | div->max = newmax; |
621 | 0 | } |
622 | |
|
623 | 0 | if (i < div->len) |
624 | 0 | memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i)); |
625 | 0 | div->len++; |
626 | 0 | div->list[i].left = left; |
627 | 0 | div->list[i].pos = pos; |
628 | 0 | div->list[i].freq = 1; |
629 | 0 | } |
630 | | |
631 | | static fz_stext_grid_positions * |
632 | | make_table_positions(fz_context *ctx, div_list *xs, float min, float max) |
633 | 0 | { |
634 | 0 | int wind; |
635 | 0 | fz_stext_grid_positions *pos; |
636 | 0 | int len = xs->len; |
637 | 0 | int i; |
638 | 0 | int hi = 0; |
639 | | |
640 | | /* Count the number of edges */ |
641 | 0 | int local_min = 0; |
642 | 0 | int edges = 2; |
643 | |
|
644 | 0 | if (len == 0) |
645 | 0 | return NULL; |
646 | | |
647 | 0 | assert(xs->list[0].left); |
648 | 0 | for (i = 0; i < len; i++) |
649 | 0 | { |
650 | 0 | if (xs->list[i].left) |
651 | 0 | { |
652 | 0 | if (local_min) |
653 | 0 | edges++; |
654 | 0 | } |
655 | 0 | else |
656 | 0 | local_min = 1; |
657 | 0 | } |
658 | 0 | assert(!xs->list[i-1].left); |
659 | | |
660 | 0 | pos = fz_calloc(ctx, 1, sizeof(*pos) + (edges-1) * sizeof(pos->list[0])); |
661 | 0 | pos->len = edges; |
662 | | |
663 | | /* Copy the edges in */ |
664 | 0 | wind = 0; |
665 | 0 | local_min = 0; |
666 | 0 | edges = 1; |
667 | 0 | pos->list[0].pos = xs->list[0].pos; |
668 | 0 | pos->list[0].min = min; |
669 | 0 | pos->list[0].max = pos->list[0].pos; |
670 | 0 | pos->list[0].uncertainty = 0; |
671 | | #ifdef DEBUG_TABLE_HUNT |
672 | | printf("|%g ", pos->list[0].pos); |
673 | | #endif |
674 | 0 | for (i = 0; i < len; i++) |
675 | 0 | { |
676 | 0 | if (xs->list[i].left) |
677 | 0 | { |
678 | 0 | if (local_min) |
679 | 0 | { |
680 | 0 | pos->list[edges].min = xs->list[i-1].pos; |
681 | 0 | pos->list[edges].max = xs->list[i].pos; |
682 | 0 | pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2; |
683 | 0 | pos->list[edges++].uncertainty = wind; |
684 | | #ifdef DEBUG_TABLE_HUNT |
685 | | if (wind) |
686 | | printf("?%g(%d) ", pos->list[edges].pos, wind); |
687 | | else |
688 | | printf("|%g ", pos->list[edges].pos); |
689 | | #endif |
690 | 0 | } |
691 | 0 | wind += xs->list[i].freq; |
692 | 0 | if (wind > hi) |
693 | 0 | hi = wind; |
694 | 0 | } |
695 | 0 | else |
696 | 0 | { |
697 | 0 | wind -= xs->list[i].freq; |
698 | 0 | local_min = 1; |
699 | 0 | } |
700 | 0 | } |
701 | 0 | assert(wind == 0); |
702 | 0 | pos->list[edges].pos = xs->list[i-1].pos; |
703 | 0 | pos->list[edges].min = xs->list[i-1].pos; |
704 | 0 | pos->list[edges].max = max; |
705 | 0 | pos->list[edges].uncertainty = 0; |
706 | 0 | pos->max_uncertainty = hi; |
707 | | #ifdef DEBUG_TABLE_HUNT |
708 | | printf("|%g\n", pos->list[edges].pos); |
709 | | #endif |
710 | |
|
711 | 0 | return pos; |
712 | 0 | } |
713 | | |
714 | | static fz_stext_grid_positions * |
715 | | clone_grid_positions(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs) |
716 | 0 | { |
717 | 0 | size_t z = sizeof(*xs) + (xs->len-1) * sizeof(xs->list[0]); |
718 | 0 | fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z); |
719 | |
|
720 | 0 | memcpy(xs2, xs, z); |
721 | |
|
722 | 0 | return xs2; |
723 | 0 | } |
724 | | |
725 | | static void |
726 | | sanitize_positions(fz_context *ctx, div_list *xs) |
727 | 0 | { |
728 | 0 | int i, j; |
729 | |
|
730 | | #ifdef DEBUG_TABLE_HUNT |
731 | | printf("OK:\n"); |
732 | | for (i = 0; i < xs->len; i++) |
733 | | { |
734 | | if (xs->list[i].left) |
735 | | printf("["); |
736 | | printf("%g(%d)", xs->list[i].pos, xs->list[i].freq); |
737 | | if (!xs->list[i].left) |
738 | | printf("]"); |
739 | | printf(" "); |
740 | | } |
741 | | printf("\n"); |
742 | | #endif |
743 | | |
744 | | /* Now, combine runs of left and right */ |
745 | 0 | for (i = 0; i < xs->len; i++) |
746 | 0 | { |
747 | 0 | if (xs->list[i].left) |
748 | 0 | { |
749 | 0 | j = i; |
750 | 0 | while (i < xs->len-1 && xs->list[i+1].left) |
751 | 0 | { |
752 | 0 | i++; |
753 | 0 | xs->list[j].freq += xs->list[i].freq; |
754 | 0 | xs->list[i].freq = 0; |
755 | 0 | } |
756 | 0 | } |
757 | 0 | else |
758 | 0 | { |
759 | 0 | while (i < xs->len-1 && !xs->list[i+1].left) |
760 | 0 | { |
761 | 0 | i++; |
762 | 0 | xs->list[i].freq += xs->list[i-1].freq; |
763 | 0 | xs->list[i-1].freq = 0; |
764 | 0 | } |
765 | 0 | } |
766 | 0 | } |
767 | |
|
768 | | #ifdef DEBUG_TABLE_HUNT |
769 | | printf("Shrunk:\n"); |
770 | | for (i = 0; i < xs->len; i++) |
771 | | { |
772 | | if (xs->list[i].left) |
773 | | printf("["); |
774 | | printf("%g(%d)", xs->list[i].pos, xs->list[i].freq); |
775 | | if (!xs->list[i].left) |
776 | | printf("]"); |
777 | | printf(" "); |
778 | | } |
779 | | printf("\n"); |
780 | | #endif |
781 | | |
782 | | /* Now remove the 0 frequency ones. */ |
783 | 0 | j = 0; |
784 | 0 | for (i = 0; i < xs->len; i++) |
785 | 0 | { |
786 | 0 | if (xs->list[i].freq == 0) |
787 | 0 | continue; |
788 | 0 | if (i != j) |
789 | 0 | xs->list[j] = xs->list[i]; |
790 | 0 | j++; |
791 | 0 | } |
792 | 0 | xs->len = j; |
793 | |
|
794 | | #ifdef DEBUG_TABLE_HUNT |
795 | | printf("Compacted:\n"); |
796 | | for (i = 0; i < xs->len; i++) |
797 | | { |
798 | | if (xs->list[i].left) |
799 | | printf("["); |
800 | | printf("%g(%d)", xs->list[i].pos, xs->list[i].freq); |
801 | | if (!xs->list[i].left) |
802 | | printf("]"); |
803 | | printf(" "); |
804 | | } |
805 | | printf("\n"); |
806 | | #endif |
807 | 0 | } |
808 | | |
809 | | static int |
810 | | all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block) |
811 | 0 | { |
812 | 0 | for (; block != NULL; block = block->next) |
813 | 0 | { |
814 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
815 | 0 | { |
816 | 0 | if (block->u.s.down == NULL) |
817 | 0 | continue; |
818 | 0 | if (block->u.s.down->standard == FZ_STRUCTURE_H || |
819 | 0 | block->u.s.down->standard == FZ_STRUCTURE_H1 || |
820 | 0 | block->u.s.down->standard == FZ_STRUCTURE_H2 || |
821 | 0 | block->u.s.down->standard == FZ_STRUCTURE_H3 || |
822 | 0 | block->u.s.down->standard == FZ_STRUCTURE_H4 || |
823 | 0 | block->u.s.down->standard == FZ_STRUCTURE_H5 || |
824 | 0 | block->u.s.down->standard == FZ_STRUCTURE_H6) |
825 | 0 | continue; |
826 | 0 | if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block)) |
827 | 0 | return 0; |
828 | 0 | } |
829 | 0 | if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) |
830 | 0 | return 0; |
831 | 0 | } |
832 | | |
833 | 0 | return 1; |
834 | 0 | } |
835 | | |
836 | | static void |
837 | | walk_blocks(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, int descend) |
838 | 0 | { |
839 | 0 | fz_stext_block *block; |
840 | 0 | fz_stext_line *line; |
841 | 0 | fz_stext_char *ch; |
842 | |
|
843 | 0 | for (block = first_block; block != NULL; block = block->next) |
844 | 0 | { |
845 | 0 | switch (block->type) |
846 | 0 | { |
847 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
848 | | /* Don't descend into */ |
849 | 0 | if (descend && block->u.s.down && !all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block)) |
850 | 0 | walk_blocks(ctx, xs, ys, block->u.s.down->first_block, descend); |
851 | 0 | break; |
852 | 0 | case FZ_STEXT_BLOCK_VECTOR: |
853 | 0 | break; |
854 | 0 | case FZ_STEXT_BLOCK_TEXT: |
855 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
856 | 0 | { |
857 | 0 | float rpos; |
858 | 0 | int left = 1; |
859 | 0 | int right = 0; |
860 | 0 | int non_empty = 0; |
861 | 0 | for (ch = line->first_char; ch != NULL; ch = ch->next) |
862 | 0 | { |
863 | 0 | if (ch->c == ' ') |
864 | 0 | { |
865 | 0 | if (ch->next == NULL) |
866 | 0 | { |
867 | | /* This is a trailing space. We've seen cases where we get |
868 | | * trailing spaces on cell contents and this screws stuff |
869 | | * up (e.g. dotted-gridlines-tables.pdf). */ |
870 | 0 | if (right) |
871 | 0 | { |
872 | | /* Send a 'right' as the left position of this space. */ |
873 | 0 | float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); |
874 | 0 | div_list_push(ctx, xs, 0, lpos); |
875 | 0 | left = 1; |
876 | 0 | right = 0; |
877 | 0 | } |
878 | 0 | } |
879 | 0 | else if (ch->next->c == ' ') |
880 | 0 | { |
881 | | /* Run of multiple spaces. Send a 'right' as the left position |
882 | | * of this space, and then skip forwards. */ |
883 | 0 | if (right) |
884 | 0 | { |
885 | 0 | float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); |
886 | 0 | div_list_push(ctx, xs, 0, lpos); |
887 | 0 | while (ch->next && ch->next->c == ' ') |
888 | 0 | ch = ch->next; |
889 | 0 | left = 1; |
890 | 0 | right = 0; |
891 | 0 | } |
892 | 0 | } |
893 | 0 | else |
894 | 0 | { |
895 | | /* Ignore any other spaces. Don't start or end a run on them. */ |
896 | 0 | } |
897 | 0 | } |
898 | 0 | else |
899 | 0 | { |
900 | 0 | non_empty = 1; |
901 | 0 | if (left) |
902 | 0 | { |
903 | 0 | float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); |
904 | 0 | div_list_push(ctx, xs, 1, lpos); |
905 | 0 | left = 0; |
906 | 0 | } |
907 | 0 | rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x); |
908 | 0 | right = 1; |
909 | 0 | } |
910 | 0 | } |
911 | 0 | if (non_empty) |
912 | 0 | { |
913 | 0 | div_list_push(ctx, ys, 1, line->bbox.y0); |
914 | 0 | div_list_push(ctx, ys, 0, line->bbox.y1); |
915 | 0 | } |
916 | 0 | if (right) |
917 | 0 | div_list_push(ctx, xs, 0, rpos); |
918 | 0 | } |
919 | 0 | break; |
920 | 0 | } |
921 | 0 | } |
922 | 0 | } |
923 | | |
924 | | /* One of our datastructures (cells_t) is an array of details about the |
925 | | * cells that make up our table. It's a w * h array of cell_t's. Each |
926 | | * cell contains data on one of the cells in the table, as you'd expect. |
927 | | * |
928 | | * . . |
929 | | * . . |
930 | | * - - +-------+ - - |
931 | | * | . |
932 | | * | . |
933 | | * | . |
934 | | * - - + - - - + - - |
935 | | * . . |
936 | | * . . |
937 | | * |
938 | | * For any given cell, we store details about the top (lowest y coord) |
939 | | * and left (lowest x coord) edges. Specifically we store whether |
940 | | * there is a line at this position (h_line and v_line) (i.e. a drawn |
941 | | * border), and we also store whether content crosses this edge (h_crossed |
942 | | * and y_crossed). Finally, we store whether the cell has any content |
943 | | * in it at all (full). |
944 | | * |
945 | | * A table which has w positions across and h positions vertically, will |
946 | | * only really have (w-1) * (h-1) cells. We store w*h though to allow for |
947 | | * the right and bottom edges to have their lines represented. |
948 | | */ |
949 | | |
950 | | typedef struct |
951 | | { |
952 | | int h_line; |
953 | | int v_line; |
954 | | int h_crossed; |
955 | | int v_crossed; |
956 | | int full; |
957 | | } cell_t; |
958 | | |
959 | | typedef struct |
960 | | { |
961 | | int w; |
962 | | int h; |
963 | | cell_t cell[1]; |
964 | | } cells_t; |
965 | | |
966 | | typedef struct |
967 | | { |
968 | | cells_t *cells; |
969 | | fz_stext_grid_positions *xpos; |
970 | | fz_stext_grid_positions *ypos; |
971 | | } grid_walker_data; |
972 | | |
973 | | static cell_t * |
974 | | get_cell(cells_t *cells, int x, int y) |
975 | 0 | { |
976 | 0 | return &cells->cell[x + y * cells->w]; |
977 | 0 | } |
978 | | |
979 | | static int |
980 | | find_grid_pos_with_reinforcement(fz_context *ctx, fz_stext_grid_positions *pos, float x, int expand) |
981 | 0 | { |
982 | 0 | int i; |
983 | |
|
984 | 0 | for (i = 0; i < pos->len; i++) |
985 | 0 | { |
986 | 0 | int r; |
987 | 0 | if (x > pos->list[i].max) |
988 | 0 | continue; |
989 | 0 | if (x < pos->list[i].min) |
990 | 0 | { |
991 | 0 | if (expand && i > 0) |
992 | 0 | { |
993 | 0 | float mid = (pos->list[i].min + pos->list[i-1].max)/2; |
994 | 0 | if (x < mid) |
995 | 0 | return i-1; |
996 | 0 | else |
997 | 0 | return i; |
998 | 0 | } |
999 | 0 | return -1; |
1000 | 0 | } |
1001 | 0 | r = pos->list[i].reinforcement++; |
1002 | 0 | pos->list[i].pos = (pos->list[i].pos * r + x) / (r+1); |
1003 | 0 | return i; |
1004 | 0 | } |
1005 | | |
1006 | 0 | return -1; |
1007 | 0 | } |
1008 | | |
1009 | | static int |
1010 | | find_cell_l(fz_stext_grid_positions *pos, float x) |
1011 | 0 | { |
1012 | 0 | int i; |
1013 | |
|
1014 | 0 | for (i = 0; i < pos->len; i++) |
1015 | 0 | if (x < pos->list[i].pos) |
1016 | 0 | return i-1; |
1017 | | |
1018 | 0 | return -1; |
1019 | 0 | } |
1020 | | |
1021 | | static int |
1022 | | find_cell_r(fz_stext_grid_positions *pos, float x) |
1023 | 0 | { |
1024 | 0 | int i; |
1025 | |
|
1026 | 0 | for (i = 0; i < pos->len; i++) |
1027 | 0 | if (x <= pos->list[i].pos) |
1028 | 0 | return i-1; |
1029 | | |
1030 | 0 | return -1; |
1031 | 0 | } |
1032 | | |
1033 | | static int |
1034 | | add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1) |
1035 | 0 | { |
1036 | 0 | int start = find_grid_pos_with_reinforcement(ctx, gd->xpos, x0, 1); |
1037 | 0 | int end = find_grid_pos_with_reinforcement(ctx, gd->xpos, x1, 1); |
1038 | 0 | float y = (y0 + y1) / 2; |
1039 | 0 | int yidx = find_grid_pos_with_reinforcement(ctx, gd->ypos, y, 0); |
1040 | 0 | int i; |
1041 | |
|
1042 | 0 | if (start < 0 || end < 0 || yidx < 0 || start >= end) |
1043 | 0 | return 1; |
1044 | | |
1045 | 0 | for (i = start; i < end; i++) |
1046 | 0 | get_cell(gd->cells, i, yidx)->h_line++; |
1047 | |
|
1048 | 0 | return 0; |
1049 | 0 | } |
1050 | | |
1051 | | static int |
1052 | | add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1) |
1053 | 0 | { |
1054 | 0 | int start = find_grid_pos_with_reinforcement(ctx, gd->ypos, y0, 1); |
1055 | 0 | int end = find_grid_pos_with_reinforcement(ctx, gd->ypos, y1, 1); |
1056 | 0 | float x = (x0 + x1) / 2; |
1057 | 0 | int xidx = find_grid_pos_with_reinforcement(ctx, gd->xpos, x, 0); |
1058 | 0 | int i; |
1059 | |
|
1060 | 0 | if (start < 0 || end < 0 || xidx < 0 || start >= end) |
1061 | 0 | return 1; |
1062 | | |
1063 | 0 | for (i = start; i < end; i++) |
1064 | 0 | get_cell(gd->cells, xidx, i)->v_line++; |
1065 | |
|
1066 | 0 | return 0; |
1067 | 0 | } |
1068 | | |
1069 | | /* Shared internal routine with stext-boxer.c */ |
1070 | | fz_rect fz_collate_small_vector_run(fz_stext_block **blockp); |
1071 | | |
1072 | | static void |
1073 | | walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) |
1074 | 0 | { |
1075 | 0 | for (; block != NULL; block = block->next) |
1076 | 0 | { |
1077 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
1078 | 0 | { |
1079 | 0 | if (block->u.s.down) |
1080 | 0 | walk_grid_lines(ctx, gd, block->u.s.down->first_block); |
1081 | 0 | continue; |
1082 | 0 | } |
1083 | 0 | else if (block->type == FZ_STEXT_BLOCK_VECTOR) |
1084 | 0 | { |
1085 | 0 | fz_rect r = block->bbox; |
1086 | 0 | float w = r.x1 - r.x0; |
1087 | 0 | float h = r.y1 - r.y0; |
1088 | 0 | int failed = 0; |
1089 | 0 | if (w > h && h < 1) |
1090 | 0 | { |
1091 | | /* Thin, wide line */ |
1092 | 0 | failed = add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1); |
1093 | 0 | } |
1094 | 0 | else if (w < h && w < 1) |
1095 | 0 | { |
1096 | | /* Thin, wide line */ |
1097 | 0 | failed = add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1); |
1098 | 0 | } |
1099 | 0 | else |
1100 | 0 | { |
1101 | | /* Rectangle */ |
1102 | 0 | int failed2; |
1103 | 0 | failed2 = add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y0); |
1104 | 0 | failed2 |= add_h_line(ctx, gd, r.x0, r.x1, r.y1, r.y1); |
1105 | 0 | failed = add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x0); |
1106 | 0 | failed |= add_v_line(ctx, gd, r.y0, r.y1, r.x1, r.x1); |
1107 | 0 | failed &= failed2; |
1108 | 0 | } |
1109 | 0 | if (failed) |
1110 | 0 | { |
1111 | | /* Try merging multiple successive vectors to get better |
1112 | | * results. */ |
1113 | 0 | r = fz_collate_small_vector_run(&block); |
1114 | 0 | w = r.x1 - r.x0; |
1115 | 0 | h = r.y1 - r.y0; |
1116 | 0 | if (w > h) |
1117 | 0 | { |
1118 | | #ifdef DEBUG |
1119 | | if (add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1)) |
1120 | | #endif |
1121 | 0 | add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1); |
1122 | 0 | } |
1123 | 0 | else |
1124 | 0 | { |
1125 | | #ifdef DEBUG |
1126 | | if (add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1)) |
1127 | | #endif |
1128 | 0 | add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1); |
1129 | 0 | } |
1130 | 0 | } |
1131 | 0 | } |
1132 | 0 | } |
1133 | 0 | } |
1134 | | |
1135 | | static void |
1136 | | erase_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) |
1137 | 0 | { |
1138 | 0 | fz_rect bounds = { |
1139 | 0 | gd->xpos->list[0].pos, |
1140 | 0 | gd->ypos->list[0].pos, |
1141 | 0 | gd->xpos->list[gd->xpos->len-1].pos, |
1142 | 0 | gd->ypos->list[gd->ypos->len-1].pos }; |
1143 | |
|
1144 | 0 | for (; block != NULL; block = block->next) |
1145 | 0 | { |
1146 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
1147 | 0 | { |
1148 | 0 | if (block->u.s.down) |
1149 | 0 | erase_grid_lines(ctx, gd, block->u.s.down->first_block); |
1150 | 0 | continue; |
1151 | 0 | } |
1152 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
1153 | 0 | { |
1154 | 0 | fz_stext_line *line; |
1155 | |
|
1156 | 0 | if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 || |
1157 | 0 | block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0) |
1158 | 0 | continue; |
1159 | | |
1160 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
1161 | 0 | { |
1162 | 0 | fz_stext_char *ch = line->first_char; |
1163 | | |
1164 | | /* Skip leading spaces */ |
1165 | 0 | while (ch != NULL && ch->c == ' ') |
1166 | 0 | ch = ch->next; |
1167 | |
|
1168 | 0 | for (; ch != NULL; ch = ch->next) |
1169 | 0 | { |
1170 | 0 | fz_rect r; |
1171 | 0 | int x, y, x0, x1, y0, y1; |
1172 | |
|
1173 | 0 | if (ch->c == 32) |
1174 | 0 | { |
1175 | | /* Trailing space, skip it. */ |
1176 | 0 | if (ch->next == NULL) |
1177 | 0 | break; |
1178 | 0 | if (ch->next->c == 32) |
1179 | 0 | { |
1180 | | /* Run of spaces. Skip 'em. */ |
1181 | 0 | while (ch->next && ch->next->c == 32) |
1182 | 0 | ch = ch->next; |
1183 | 0 | continue; |
1184 | 0 | } |
1185 | | /* A single space. Accept it. */ |
1186 | 0 | } |
1187 | 0 | r = fz_rect_from_quad(ch->quad); |
1188 | 0 | x0 = find_cell_l(gd->xpos, r.x0); |
1189 | 0 | x1 = find_cell_r(gd->xpos, r.x1); |
1190 | 0 | y0 = find_cell_l(gd->ypos, r.y0); |
1191 | 0 | y1 = find_cell_r(gd->ypos, r.y1); |
1192 | 0 | if (x0 < 0 || x1 <0 || y0 < 0 || y1 < 0) |
1193 | 0 | continue; |
1194 | 0 | if (x0 < x1) |
1195 | 0 | { |
1196 | 0 | for (y = y0; y <= y1; y++) |
1197 | 0 | for (x = x0; x < x1; x++) |
1198 | 0 | get_cell(gd->cells, x+1, y)->v_crossed++; |
1199 | 0 | } |
1200 | 0 | if (y0 < y1) |
1201 | 0 | { |
1202 | 0 | for (y = y0; y < y1; y++) |
1203 | 0 | for (x = x0; x <= x1; x++) |
1204 | 0 | get_cell(gd->cells, x, y+1)->h_crossed++; |
1205 | 0 | } |
1206 | 0 | for (y = y0; y <= y1; y++) |
1207 | 0 | for (x = x0; x <= x1; x++) |
1208 | 0 | get_cell(gd->cells, x, y)->full++; |
1209 | 0 | } |
1210 | 0 | } |
1211 | 0 | } |
1212 | 0 | } |
1213 | 0 | } |
1214 | | |
1215 | | static cells_t *new_cells(fz_context *ctx, int w, int h) |
1216 | 0 | { |
1217 | 0 | cells_t *cells = fz_calloc(ctx, 1, sizeof(cells_t) + sizeof(cells->cell[0]) * (w * h - 1)); |
1218 | 0 | cells->w = w; |
1219 | 0 | cells->h = h; |
1220 | |
|
1221 | 0 | return cells; |
1222 | 0 | } |
1223 | | |
1224 | | #ifdef DEBUG_TABLE_STRUCTURE |
1225 | | static void |
1226 | | asciiart_table(grid_walker_data *gd) |
1227 | | { |
1228 | | int w = gd->xpos->len; |
1229 | | int h = gd->ypos->len; |
1230 | | int x, y; |
1231 | | |
1232 | | for (y = 0; y < h; y++) |
1233 | | { |
1234 | | for (x = 0; x < w-1; x++) |
1235 | | { |
1236 | | cell_t *cell = get_cell(gd->cells, x, y); |
1237 | | int line = cell->h_line; |
1238 | | int erase = cell->h_crossed; |
1239 | | printf("+"); |
1240 | | if (line && !erase) |
1241 | | { |
1242 | | printf("-"); |
1243 | | } |
1244 | | else if (!line && erase) |
1245 | | { |
1246 | | printf("v"); |
1247 | | } |
1248 | | else if (line && erase) |
1249 | | { |
1250 | | printf("*"); |
1251 | | } |
1252 | | else |
1253 | | { |
1254 | | printf(" "); |
1255 | | } |
1256 | | } |
1257 | | printf("+\n"); |
1258 | | if (y == h-1) |
1259 | | break; |
1260 | | for (x = 0; x < w; x++) |
1261 | | { |
1262 | | cell_t *cell = get_cell(gd->cells, x, y); |
1263 | | int line = cell->v_line; |
1264 | | int erase = cell->v_crossed; |
1265 | | if (line && !erase) |
1266 | | { |
1267 | | printf("|"); |
1268 | | } |
1269 | | else if (!line && erase) |
1270 | | { |
1271 | | printf(">"); |
1272 | | } |
1273 | | else if (line && erase) |
1274 | | { |
1275 | | printf("*"); |
1276 | | } |
1277 | | else |
1278 | | { |
1279 | | printf(" "); |
1280 | | } |
1281 | | if (x < w-1) |
1282 | | { |
1283 | | if (cell->full) |
1284 | | printf("#"); |
1285 | | else |
1286 | | printf(" "); |
1287 | | } |
1288 | | else |
1289 | | printf("\n"); |
1290 | | } |
1291 | | } |
1292 | | } |
1293 | | #endif |
1294 | | |
1295 | | static void |
1296 | | recalc_bbox(fz_stext_block *block) |
1297 | 0 | { |
1298 | 0 | fz_rect bbox = fz_empty_rect; |
1299 | 0 | fz_stext_line *line; |
1300 | |
|
1301 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
1302 | 0 | bbox = fz_union_rect(bbox, line->bbox); |
1303 | |
|
1304 | 0 | block->bbox = bbox; |
1305 | 0 | } |
1306 | | |
1307 | | static void |
1308 | | unlink_line_from_block(fz_stext_line *line, fz_stext_block *block) |
1309 | 0 | { |
1310 | 0 | fz_stext_line *next_line = line->next; |
1311 | |
|
1312 | 0 | if (line->prev) |
1313 | 0 | { |
1314 | 0 | assert(line->prev->next == line); |
1315 | 0 | line->prev->next = next_line; |
1316 | 0 | } |
1317 | 0 | else |
1318 | 0 | { |
1319 | 0 | assert(block->u.t.first_line == line); |
1320 | 0 | block->u.t.first_line = next_line; |
1321 | 0 | } |
1322 | 0 | if (next_line) |
1323 | 0 | { |
1324 | 0 | assert(next_line->prev == line); |
1325 | 0 | next_line->prev = line->prev; |
1326 | 0 | } |
1327 | 0 | else |
1328 | 0 | { |
1329 | 0 | assert(block->u.t.last_line == line); |
1330 | 0 | block->u.t.last_line = line->prev; |
1331 | 0 | } |
1332 | 0 | } |
1333 | | |
1334 | | static void |
1335 | | append_line_to_block(fz_stext_line *line, fz_stext_block *block) |
1336 | 0 | { |
1337 | 0 | if (block->u.t.last_line == NULL) |
1338 | 0 | { |
1339 | 0 | assert(block->u.t.first_line == NULL); |
1340 | 0 | block->u.t.first_line = block->u.t.last_line = line; |
1341 | 0 | line->prev = NULL; |
1342 | 0 | } |
1343 | 0 | else |
1344 | 0 | { |
1345 | 0 | assert(block->u.t.last_line->next == NULL); |
1346 | 0 | line->prev = block->u.t.last_line; |
1347 | 0 | block->u.t.last_line->next = line; |
1348 | 0 | block->u.t.last_line = line; |
1349 | 0 | } |
1350 | 0 | line->next = NULL; |
1351 | 0 | } |
1352 | | |
1353 | | static void |
1354 | | unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last) |
1355 | 0 | { |
1356 | 0 | if (block->prev) |
1357 | 0 | { |
1358 | 0 | assert(block->prev->next == block); |
1359 | 0 | block->prev->next = block->next; |
1360 | 0 | } |
1361 | 0 | else |
1362 | 0 | { |
1363 | 0 | assert(*first == block); |
1364 | 0 | *first = block->next; |
1365 | 0 | } |
1366 | 0 | if (block->next) |
1367 | 0 | { |
1368 | 0 | assert(block->next->prev == block); |
1369 | 0 | block->next->prev = block->prev; |
1370 | 0 | } |
1371 | 0 | else |
1372 | 0 | { |
1373 | 0 | assert(*last == block); |
1374 | 0 | *last = block->prev; |
1375 | 0 | } |
1376 | 0 | } |
1377 | | |
1378 | | #ifndef NDEBUG |
1379 | | static int |
1380 | | verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src) |
1381 | 0 | { |
1382 | 0 | fz_stext_block *block; |
1383 | 0 | fz_stext_block **first = src ? &src->first_block : &page->first_block; |
1384 | 0 | fz_stext_block **last = src ? &src->last_block : &page->last_block; |
1385 | 0 | int max = 0; |
1386 | |
|
1387 | 0 | assert((*first == NULL) == (*last == NULL)); |
1388 | | |
1389 | 0 | for (block = *first; block != NULL; block = block->next) |
1390 | 0 | { |
1391 | 0 | fz_stext_line *line; |
1392 | |
|
1393 | 0 | if (block->prev == NULL) |
1394 | 0 | assert(*first == block); |
1395 | 0 | else |
1396 | 0 | assert(block->prev->next == block); |
1397 | 0 | if (block->next == NULL) |
1398 | 0 | assert(*last == block); |
1399 | 0 | else |
1400 | 0 | assert(block->next->prev == block); |
1401 | | |
1402 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
1403 | 0 | { |
1404 | 0 | if (block->u.s.down) |
1405 | 0 | { |
1406 | 0 | int m = verify_stext(ctx, page, block->u.s.down); |
1407 | 0 | if (m > max) |
1408 | 0 | max = m; |
1409 | 0 | } |
1410 | 0 | continue; |
1411 | 0 | } |
1412 | 0 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
1413 | 0 | continue; |
1414 | 0 | assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL)); |
1415 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
1416 | 0 | { |
1417 | 0 | fz_stext_char *ch; |
1418 | |
|
1419 | 0 | if (line->next == NULL) |
1420 | 0 | assert(block->u.t.last_line == line); |
1421 | 0 | else |
1422 | 0 | assert(line->next->prev == line); |
1423 | | |
1424 | 0 | assert((line->first_char == NULL) == (line->last_char == NULL)); |
1425 | | |
1426 | 0 | for (ch = line->first_char; ch != NULL; ch = ch->next) |
1427 | 0 | assert(ch->next != NULL || line->last_char == ch); |
1428 | 0 | } |
1429 | 0 | } |
1430 | | |
1431 | 0 | return max+1; |
1432 | 0 | } |
1433 | | #endif |
1434 | | |
1435 | | static fz_rect |
1436 | | move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r) |
1437 | 0 | { |
1438 | 0 | fz_stext_block *before = dest ? dest->first_block : page->first_block; |
1439 | 0 | fz_stext_block **sfirst = src ? &src->first_block : &page->first_block; |
1440 | 0 | fz_stext_block **slast = src ? &src->last_block : &page->last_block; |
1441 | 0 | fz_stext_block *block, *next_block; |
1442 | |
|
1443 | 0 | for (block = *sfirst; block != NULL; block = next_block) |
1444 | 0 | { |
1445 | 0 | fz_rect bbox = fz_intersect_rect(block->bbox, r); |
1446 | 0 | next_block = block->next; |
1447 | | /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ |
1448 | 0 | if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1) |
1449 | 0 | continue; /* Trivially excluded */ |
1450 | 0 | if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1) |
1451 | 0 | { |
1452 | | /* Trivially included */ |
1453 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) |
1454 | 0 | { |
1455 | 0 | if (block->u.s.down->standard == FZ_STRUCTURE_TD) |
1456 | 0 | { |
1457 | | /* Don't copy TDs! Just copy the contents */ |
1458 | 0 | move_contained_content(ctx, page, dest, block->u.s.down, r); |
1459 | 0 | continue; |
1460 | 0 | } |
1461 | 0 | } |
1462 | 0 | unlink_block(block, sfirst, slast); |
1463 | 0 | insert_block_before(block, before, page, dest); |
1464 | 0 | assert(before == block->next); |
1465 | 0 | continue; |
1466 | 0 | } |
1467 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
1468 | 0 | { |
1469 | 0 | if (block->u.s.down && !all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block)) |
1470 | 0 | move_contained_content(ctx, page, dest, block->u.s.down, r); |
1471 | 0 | } |
1472 | 0 | if (block->type == FZ_STEXT_BLOCK_TEXT) |
1473 | 0 | { |
1474 | | /* Partially included text block */ |
1475 | 0 | fz_stext_line *line, *next_line; |
1476 | 0 | fz_stext_block *newblock = NULL; |
1477 | |
|
1478 | 0 | for (line = block->u.t.first_line; line != NULL; line = next_line) |
1479 | 0 | { |
1480 | 0 | fz_rect lrect = fz_intersect_rect(line->bbox, r); |
1481 | 0 | next_line = line->next; |
1482 | | |
1483 | | /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ |
1484 | 0 | if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1) |
1485 | 0 | continue; /* Trivial exclusion */ |
1486 | 0 | if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1) |
1487 | 0 | { |
1488 | | /* Trivial inclusion */ |
1489 | 0 | if (newblock == NULL) |
1490 | 0 | { |
1491 | 0 | newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); |
1492 | 0 | insert_block_before(newblock, before, page, dest); |
1493 | 0 | assert(before == newblock->next); |
1494 | 0 | } |
1495 | | |
1496 | 0 | unlink_line_from_block(line, block); |
1497 | 0 | append_line_to_block(line, newblock); |
1498 | 0 | } |
1499 | 0 | else |
1500 | 0 | { |
1501 | | /* Need to walk the line and just take parts */ |
1502 | 0 | fz_stext_line *newline = NULL; |
1503 | 0 | fz_stext_char *ch, *next_ch, *prev_ch = NULL; |
1504 | |
|
1505 | 0 | for (ch = line->first_char; ch != NULL; ch = next_ch) |
1506 | 0 | { |
1507 | 0 | fz_rect crect = fz_rect_from_quad(ch->quad); |
1508 | 0 | float x = (crect.x0 + crect.x1)/2; |
1509 | 0 | float y = (crect.y0 + crect.y1)/2; |
1510 | 0 | next_ch = ch->next; |
1511 | 0 | if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y) |
1512 | 0 | { |
1513 | 0 | prev_ch = ch; |
1514 | 0 | continue; |
1515 | 0 | } |
1516 | | /* Take this char */ |
1517 | 0 | if (newline == NULL) |
1518 | 0 | { |
1519 | 0 | newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline)); |
1520 | 0 | newline->dir = line->dir; |
1521 | 0 | newline->wmode = line->wmode; |
1522 | 0 | newline->bbox = fz_empty_rect; |
1523 | 0 | } |
1524 | | /* Unlink char */ |
1525 | 0 | if (prev_ch == NULL) |
1526 | 0 | line->first_char = next_ch; |
1527 | 0 | else |
1528 | 0 | prev_ch->next = next_ch; |
1529 | 0 | if (next_ch == NULL) |
1530 | 0 | line->last_char = prev_ch; |
1531 | | /* Relink char */ |
1532 | 0 | ch->next = NULL; |
1533 | 0 | if (newline->last_char == NULL) |
1534 | 0 | newline->first_char = ch; |
1535 | 0 | else |
1536 | 0 | newline->last_char->next = ch; |
1537 | 0 | newline->last_char = ch; |
1538 | 0 | newline->bbox = fz_union_rect(newline->bbox, crect); |
1539 | 0 | } |
1540 | 0 | if (line->first_char == NULL) |
1541 | 0 | { |
1542 | | /* We've removed all the chars from this line. |
1543 | | * Better remove the line too. */ |
1544 | 0 | if (line->prev) |
1545 | 0 | line->prev->next = next_line; |
1546 | 0 | else |
1547 | 0 | block->u.t.first_line = next_line; |
1548 | 0 | if (next_line) |
1549 | 0 | next_line->prev = line->prev; |
1550 | 0 | else |
1551 | 0 | block->u.t.last_line = line->prev; |
1552 | 0 | } |
1553 | 0 | if (newline) |
1554 | 0 | { |
1555 | 0 | if (newblock == NULL) |
1556 | 0 | { |
1557 | 0 | newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); |
1558 | | |
1559 | | /* Add the block onto our target list */ |
1560 | 0 | insert_block_before(newblock, before, page, dest); |
1561 | 0 | before = newblock->next; |
1562 | 0 | } |
1563 | 0 | append_line_to_block(newline, newblock); |
1564 | 0 | } |
1565 | 0 | } |
1566 | 0 | } |
1567 | 0 | if (newblock) |
1568 | 0 | { |
1569 | 0 | recalc_bbox(block); |
1570 | 0 | recalc_bbox(newblock); |
1571 | 0 | } |
1572 | 0 | if (block->u.t.first_line == NULL) |
1573 | 0 | { |
1574 | | /* We've removed all the lines from the block. Should remove that too! */ |
1575 | 0 | if (block->prev) |
1576 | 0 | { |
1577 | 0 | assert(block->prev->next == block); |
1578 | 0 | block->prev->next = next_block; |
1579 | 0 | } |
1580 | 0 | else |
1581 | 0 | { |
1582 | 0 | assert(*sfirst == block); |
1583 | 0 | *sfirst = block->next; |
1584 | 0 | } |
1585 | 0 | if (next_block) |
1586 | 0 | { |
1587 | 0 | assert(next_block->prev == block); |
1588 | 0 | next_block->prev = block->prev; |
1589 | 0 | } |
1590 | 0 | else |
1591 | 0 | { |
1592 | 0 | assert(*slast == block); |
1593 | 0 | *slast = block->prev; |
1594 | 0 | } |
1595 | 0 | } |
1596 | 0 | } |
1597 | 0 | } |
1598 | | |
1599 | 0 | return r; |
1600 | 0 | } |
1601 | | |
1602 | | static fz_stext_block * |
1603 | | find_table_insertion_point(fz_context *ctx, fz_rect r, fz_stext_block *block) |
1604 | 0 | { |
1605 | 0 | fz_stext_block *after = NULL; |
1606 | |
|
1607 | 0 | for (; block != NULL; block = block->next) |
1608 | 0 | { |
1609 | 0 | fz_rect s = fz_intersect_rect(r, block->bbox); |
1610 | |
|
1611 | 0 | if (s.x0 > s.x1 || s.y0 > s.y1) |
1612 | 0 | continue; |
1613 | 0 | after = block; |
1614 | 0 | } |
1615 | | |
1616 | | /* Convert to before */ |
1617 | 0 | if (after) |
1618 | 0 | after = after->next; |
1619 | |
|
1620 | 0 | return after; |
1621 | 0 | } |
1622 | | |
1623 | | static fz_stext_struct * |
1624 | | transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent) |
1625 | 0 | { |
1626 | 0 | int w = gd->xpos->len; |
1627 | 0 | int h = gd->ypos->len; |
1628 | 0 | int x, y; |
1629 | 0 | char *sent_tab = fz_calloc(ctx, 1, w*h); |
1630 | 0 | fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; |
1631 | 0 | fz_stext_struct *table, *tr, *td; |
1632 | 0 | fz_stext_block *before; |
1633 | 0 | fz_rect r; |
1634 | | |
1635 | | /* Where should we insert the table in the data? */ |
1636 | 0 | r.x0 = gd->xpos->list[0].pos; |
1637 | 0 | r.x1 = gd->xpos->list[w-1].pos; |
1638 | 0 | r.y0 = gd->ypos->list[0].pos; |
1639 | 0 | r.y1 = gd->ypos->list[h-1].pos; |
1640 | 0 | before = find_table_insertion_point(ctx, r, *first_block); |
1641 | | |
1642 | | /* Make table */ |
1643 | 0 | table = add_struct_block_before(ctx, before, page, parent, FZ_STRUCTURE_TABLE, "Table"); |
1644 | | |
1645 | | /* Run through the cells, and guess at spanning. */ |
1646 | 0 | for (y = 0; y < h-1; y++) |
1647 | 0 | { |
1648 | | /* Have we sent this entire row before? */ |
1649 | 0 | for (x = 0; x < w-1; x++) |
1650 | 0 | { |
1651 | 0 | if (!sent_tab[x+y*w]) |
1652 | 0 | break; |
1653 | 0 | } |
1654 | 0 | if (x == w-1) |
1655 | 0 | continue; /* No point in sending a row with nothing in it! */ |
1656 | | |
1657 | | /* Make TR */ |
1658 | 0 | tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR"); |
1659 | |
|
1660 | 0 | for (x = 0; x < w-1; x++) |
1661 | 0 | { |
1662 | 0 | int x2, y2; |
1663 | 0 | int cellw = 1; |
1664 | 0 | int cellh = 1; |
1665 | | |
1666 | | /* Have we sent this cell already? */ |
1667 | 0 | if (sent_tab[x+y*w]) |
1668 | 0 | continue; |
1669 | | |
1670 | | /* Find the width of the cell */ |
1671 | 0 | for (x2 = x+1; x2 < w-1; x2++) |
1672 | 0 | { |
1673 | 0 | cell_t *cell = get_cell(gd->cells, x2, y); |
1674 | 0 | if (cell->v_line) |
1675 | 0 | break; /* Can't go past a line */ |
1676 | 0 | if (gd->xpos->list[x2].uncertainty == 0) |
1677 | 0 | break; /* An uncertainty of 0 is as good as a line. */ |
1678 | 0 | if (!cell->v_crossed) |
1679 | 0 | break; |
1680 | 0 | cellw++; |
1681 | 0 | } |
1682 | | /* Find the height of the cell */ |
1683 | 0 | for (y2 = y+1; y2 < h-1; y2++) |
1684 | 0 | { |
1685 | 0 | cell_t *cell; |
1686 | 0 | int h_crossed = 0; |
1687 | 0 | if (gd->ypos->list[y2].uncertainty == 0) |
1688 | 0 | break; /* An uncertainty of 0 is as good as a line. */ |
1689 | | |
1690 | 0 | cell = get_cell(gd->cells, x, y2); |
1691 | 0 | if (cell->h_line) |
1692 | 0 | break; /* Can't extend down through a line. */ |
1693 | 0 | if (cell->h_crossed) |
1694 | 0 | h_crossed = 1; |
1695 | 0 | for (x2 = x+1; x2 < x+cellw; x2++) |
1696 | 0 | { |
1697 | 0 | cell_t *cell = get_cell(gd->cells, x2, y2); |
1698 | 0 | if (cell->h_line) |
1699 | 0 | break; |
1700 | 0 | if (cell->v_line) |
1701 | 0 | break; /* Can't go past a line */ |
1702 | 0 | if (gd->xpos->list[x2].uncertainty == 0) |
1703 | 0 | break; /* An uncertainty of 0 is as good as a line. */ |
1704 | 0 | if (!cell->v_crossed) |
1705 | 0 | break; |
1706 | 0 | if (cell->h_crossed) |
1707 | 0 | h_crossed = 1; |
1708 | 0 | } |
1709 | 0 | if (x2 == x+cellw && h_crossed) |
1710 | 0 | cellh++; |
1711 | 0 | else |
1712 | 0 | break; |
1713 | 0 | } |
1714 | | /* Make TD */ |
1715 | 0 | td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD"); |
1716 | 0 | r.x0 = gd->xpos->list[x].pos; |
1717 | 0 | r.x1 = gd->xpos->list[x+cellw].pos; |
1718 | 0 | r.y0 = gd->ypos->list[y].pos; |
1719 | 0 | r.y1 = gd->ypos->list[y+cellh].pos; |
1720 | | /* Use r, not REAL contents bbox, as otherwise spanned rows |
1721 | | * can end up empty. */ |
1722 | 0 | td->up->bbox = r; |
1723 | 0 | move_contained_content(ctx, page, td, parent, r); |
1724 | | #ifdef DEBUG_TABLE_STRUCTURE |
1725 | | printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh); |
1726 | | #endif |
1727 | 0 | for (y2 = y; y2 < y+cellh; y2++) |
1728 | 0 | for (x2 = x; x2 < x+cellw; x2++) |
1729 | 0 | sent_tab[x2+y2*w] = 1; |
1730 | 0 | } |
1731 | 0 | r.x0 = gd->xpos->list[0].pos; |
1732 | 0 | r.x1 = gd->xpos->list[gd->xpos->len-1].pos; |
1733 | 0 | r.y0 = gd->ypos->list[y].pos; |
1734 | 0 | r.y1 = gd->ypos->list[y+1].pos; |
1735 | 0 | tr->up->bbox = r; |
1736 | 0 | table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox); |
1737 | 0 | } |
1738 | 0 | fz_free(ctx, sent_tab); |
1739 | |
|
1740 | 0 | return table; |
1741 | 0 | } |
1742 | | |
1743 | | static void |
1744 | | merge_column(grid_walker_data *gd, int x) |
1745 | 0 | { |
1746 | 0 | int y; |
1747 | 0 | for (y = 0; y < gd->cells->h; y++) |
1748 | 0 | { |
1749 | 0 | cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)]; |
1750 | 0 | cell_t *s = &gd->cells->cell[x + y * gd->cells->w]; |
1751 | |
|
1752 | 0 | if (x > 0) |
1753 | 0 | memcpy(d-x, s-x, x * sizeof(*d)); |
1754 | 0 | d->full = s[0].full || s[1].full; |
1755 | 0 | d->h_crossed = s[0].h_crossed || s[1].h_crossed; |
1756 | 0 | d->h_line = s[0].h_line; /* == s[1].h_line */ |
1757 | 0 | d->v_crossed = s[0].v_crossed; |
1758 | 0 | d->v_line = s[0].v_line; |
1759 | 0 | if (x < gd->cells->w - 2) |
1760 | 0 | memcpy(d+1, s+2, (gd->cells->w - 2 - x) * sizeof(*d)); |
1761 | 0 | } |
1762 | 0 | gd->cells->w--; |
1763 | |
|
1764 | 0 | if (x < gd->xpos->len - 2) |
1765 | 0 | memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], (gd->xpos->len - 2 - x) * sizeof(gd->xpos->list[0])); |
1766 | 0 | gd->xpos->len--; |
1767 | 0 | } |
1768 | | |
1769 | | static void |
1770 | | merge_columns(grid_walker_data *gd) |
1771 | 0 | { |
1772 | 0 | int x, y; |
1773 | |
|
1774 | 0 | for (x = gd->cells->w-3; x >= 0; x--) |
1775 | 0 | { |
1776 | | /* Can column x be merged with column x+1? */ |
1777 | | /* We never merge 2 columns where the uncertainty of the division between them is 0. */ |
1778 | 0 | if (gd->xpos->list[x+1].uncertainty == 0) |
1779 | 0 | break; |
1780 | | /* This requires all the pairs of cells in those 2 columns to be mergeable. */ |
1781 | 0 | for (y = 0; y < gd->cells->h-1; y++) |
1782 | 0 | { |
1783 | 0 | cell_t *a = get_cell(gd->cells, x, y); |
1784 | 0 | cell_t *b = get_cell(gd->cells, x+1, y); |
1785 | | /* If there is a divider, we can't merge. */ |
1786 | 0 | if (b->v_line) |
1787 | 0 | break; |
1788 | | /* If either is empty, we can merge. */ |
1789 | 0 | if (!a->full || !b->full) |
1790 | 0 | continue; |
1791 | | /* If we differ in h linedness, we can't merge */ |
1792 | 0 | if (!!a->h_line != !!b->h_line) |
1793 | 0 | break; |
1794 | | /* If both are full, we can only merge if we cross. */ |
1795 | 0 | if (a->full && b->full && b->v_crossed) |
1796 | 0 | continue; |
1797 | | /* Otherwise we can't merge */ |
1798 | 0 | break; |
1799 | 0 | } |
1800 | 0 | if (y == gd->cells->h-1) |
1801 | 0 | { |
1802 | | /* Merge the column! */ |
1803 | | #ifdef DEBUG_TABLE_STRUCTURE |
1804 | | printf("Merging column %d\n", x); |
1805 | | #endif |
1806 | 0 | merge_column(gd, x); |
1807 | | #ifdef DEBUG_TABLE_STRUCTURE |
1808 | | asciiart_table(gd); |
1809 | | #endif |
1810 | 0 | } |
1811 | 0 | } |
1812 | 0 | } |
1813 | | |
1814 | | static void |
1815 | | merge_row(grid_walker_data *gd, int y) |
1816 | 0 | { |
1817 | 0 | int x; |
1818 | 0 | int w = gd->cells->w; |
1819 | 0 | cell_t *d = &gd->cells->cell[y * w]; |
1820 | 0 | for (x = 0; x < gd->cells->w-1; x++) |
1821 | 0 | { |
1822 | 0 | if (d->full == 0) |
1823 | 0 | d->full = d[w].full; |
1824 | 0 | if (d->h_crossed == 0) |
1825 | 0 | d->h_crossed = d[w].h_crossed; |
1826 | 0 | d++; |
1827 | 0 | } |
1828 | 0 | if (y < gd->cells->h - 2) |
1829 | 0 | memcpy(d, d+w, (gd->cells->h - 2 - y) * w * sizeof(*d)); |
1830 | 0 | gd->cells->h--; |
1831 | |
|
1832 | 0 | if (y < gd->ypos->len - 2) |
1833 | 0 | memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], (gd->ypos->len - 2 - y) * sizeof(gd->ypos->list[0])); |
1834 | 0 | gd->ypos->len--; |
1835 | 0 | } |
1836 | | |
1837 | | static void |
1838 | | merge_rows(grid_walker_data *gd) |
1839 | 0 | { |
1840 | 0 | int x, y; |
1841 | |
|
1842 | 0 | for (y = gd->cells->h-3; y >= 0; y--) |
1843 | 0 | { |
1844 | | /* Can row y be merged with row y+1? */ |
1845 | | /* We never merge 2 rows where the uncertainty of the division between them is 0. */ |
1846 | 0 | if (gd->ypos->list[y+1].uncertainty == 0) |
1847 | 0 | break; |
1848 | | /* This requires all the pairs of cells in those 2 rows to be mergeable. */ |
1849 | 0 | for (x = 0; x < gd->cells->w-1; x++) |
1850 | 0 | { |
1851 | 0 | cell_t *a = get_cell(gd->cells, x, y); |
1852 | 0 | cell_t *b = get_cell(gd->cells, x, y+1); |
1853 | | /* If there is a divider, we can't merge. */ |
1854 | 0 | if (b->h_line) |
1855 | 0 | break; |
1856 | | /* If either is empty, we can merge. */ |
1857 | 0 | if (!a->full || !b->full) |
1858 | 0 | continue; |
1859 | | /* If we differ in v linedness, we can't merge */ |
1860 | 0 | if (!!a->v_line != !!b->v_line) |
1861 | 0 | break; |
1862 | | /* If both are full, we can only merge if we cross. */ |
1863 | 0 | if (a->full && b->full && b->h_crossed) |
1864 | 0 | continue; |
1865 | | /* Otherwise we can't merge */ |
1866 | 0 | break; |
1867 | 0 | } |
1868 | 0 | if (x == gd->cells->w-1) |
1869 | 0 | { |
1870 | | /* Merge the row! */ |
1871 | | #ifdef DEBUG_TABLE_STRUCTURE |
1872 | | printf("Merging row %d\n", y); |
1873 | | #endif |
1874 | 0 | merge_row(gd, y); |
1875 | | #ifdef DEBUG_TABLE_STRUCTURE |
1876 | | asciiart_table(gd); |
1877 | | #endif |
1878 | 0 | } |
1879 | 0 | } |
1880 | 0 | } |
1881 | | |
1882 | | static int |
1883 | | tr_is_empty(fz_context *ctx, fz_stext_block *block) |
1884 | 0 | { |
1885 | 0 | for (; block != NULL; block = block->next) |
1886 | 0 | { |
1887 | 0 | if (block->type != FZ_STEXT_BLOCK_STRUCT) |
1888 | 0 | return 0; |
1889 | 0 | if (!block->u.s.down) |
1890 | 0 | { |
1891 | 0 | } |
1892 | 0 | else if (block->u.s.down->standard != FZ_STRUCTURE_TD) |
1893 | 0 | return 0; |
1894 | 0 | else if (block->u.s.down->first_block != NULL) |
1895 | 0 | return 0; |
1896 | 0 | } |
1897 | | |
1898 | 0 | return 1; |
1899 | 0 | } |
1900 | | |
1901 | | static int |
1902 | | table_is_empty(fz_context *ctx, fz_stext_block *block) |
1903 | 0 | { |
1904 | 0 | for (; block != NULL; block = block->next) |
1905 | 0 | { |
1906 | 0 | if (block->type == FZ_STEXT_BLOCK_GRID) |
1907 | 0 | continue; |
1908 | 0 | if (block->type != FZ_STEXT_BLOCK_STRUCT) |
1909 | 0 | return 0; |
1910 | 0 | if (!block->u.s.down) |
1911 | 0 | { |
1912 | 0 | } |
1913 | 0 | else if (block->u.s.down->standard != FZ_STRUCTURE_TR) |
1914 | 0 | return 0; |
1915 | 0 | else if (!tr_is_empty(ctx, block->u.s.down->first_block)) |
1916 | 0 | return 0; |
1917 | 0 | } |
1918 | | |
1919 | 0 | return 1; |
1920 | 0 | } |
1921 | | |
1922 | | static void |
1923 | | tidy_td_divs(fz_context *ctx, fz_stext_struct *parent) |
1924 | 0 | { |
1925 | 0 | while (1) |
1926 | 0 | { |
1927 | 0 | fz_stext_block *block = parent->first_block; |
1928 | |
|
1929 | 0 | if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV) |
1930 | 0 | return; |
1931 | | |
1932 | 0 | parent->first_block = block->u.s.down->first_block; |
1933 | 0 | parent->last_block = block->u.s.down->last_block; |
1934 | 0 | } |
1935 | 0 | } |
1936 | | |
1937 | | static void |
1938 | | tidy_tr_divs(fz_context *ctx, fz_stext_block *block) |
1939 | 0 | { |
1940 | 0 | for (; block != NULL; block = block->next) |
1941 | 0 | { |
1942 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD) |
1943 | 0 | tidy_td_divs(ctx, block->u.s.down); |
1944 | 0 | } |
1945 | 0 | } |
1946 | | |
1947 | | static void |
1948 | | tidy_table_divs(fz_context *ctx, fz_stext_block *block) |
1949 | 0 | { |
1950 | 0 | for (; block != NULL; block = block->next) |
1951 | 0 | { |
1952 | 0 | if (block->type == FZ_STEXT_BLOCK_GRID) |
1953 | 0 | continue; |
1954 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR) |
1955 | 0 | tidy_tr_divs(ctx, block->u.s.down->first_block); |
1956 | 0 | } |
1957 | 0 | } |
1958 | | |
1959 | | static int |
1960 | | div_is_empty(fz_context *ctx, fz_stext_block *block) |
1961 | 0 | { |
1962 | 0 | for (; block != NULL; block = block->next) |
1963 | 0 | { |
1964 | 0 | if (block->type != FZ_STEXT_BLOCK_STRUCT) |
1965 | 0 | return 0; |
1966 | 0 | if (!block->u.s.down) |
1967 | 0 | { |
1968 | 0 | } |
1969 | 0 | else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) |
1970 | 0 | { |
1971 | 0 | tidy_table_divs(ctx, block->u.s.down->first_block); |
1972 | 0 | return table_is_empty(ctx, block->u.s.down->first_block); |
1973 | 0 | } |
1974 | 0 | else if (block->u.s.down->standard != FZ_STRUCTURE_DIV) |
1975 | 0 | return 0; |
1976 | 0 | else if (!div_is_empty(ctx, block->u.s.down->first_block)) |
1977 | 0 | return 0; |
1978 | 0 | } |
1979 | | |
1980 | 0 | return 1; |
1981 | 0 | } |
1982 | | |
1983 | | static void |
1984 | | tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) |
1985 | 0 | { |
1986 | 0 | fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block; |
1987 | 0 | fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block; |
1988 | 0 | fz_stext_block *block, *next_block; |
1989 | |
|
1990 | 0 | for (block = *first_blockp; block != NULL; block = next_block) |
1991 | 0 | { |
1992 | 0 | next_block = block->next; |
1993 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) |
1994 | 0 | { |
1995 | 0 | if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) |
1996 | 0 | { |
1997 | 0 | tidy_table_divs(ctx, block->u.s.down->first_block); |
1998 | 0 | if (table_is_empty(ctx, block->u.s.down->first_block)) |
1999 | 0 | { |
2000 | | /* Remove block */ |
2001 | 0 | if (block->prev) |
2002 | 0 | { |
2003 | 0 | assert(block->prev->next == block); |
2004 | 0 | block->prev->next = next_block; |
2005 | 0 | } |
2006 | 0 | else |
2007 | 0 | { |
2008 | 0 | assert(*first_blockp == block); |
2009 | 0 | *first_blockp = next_block; |
2010 | 0 | } |
2011 | 0 | if (next_block) |
2012 | 0 | { |
2013 | 0 | assert(next_block->prev == block); |
2014 | 0 | next_block->prev = block->prev; |
2015 | 0 | } |
2016 | 0 | else |
2017 | 0 | { |
2018 | 0 | assert(*last_blockp == block); |
2019 | 0 | *last_blockp = block->prev; |
2020 | 0 | } |
2021 | 0 | } |
2022 | 0 | else |
2023 | 0 | { |
2024 | 0 | tidy_orphaned_tables(ctx, page, block->u.s.down); |
2025 | 0 | } |
2026 | 0 | } |
2027 | 0 | if (block->u.s.down->standard == FZ_STRUCTURE_DIV) |
2028 | 0 | { |
2029 | 0 | tidy_orphaned_tables(ctx, page, block->u.s.down); |
2030 | 0 | if (div_is_empty(ctx, block->u.s.down->first_block)) |
2031 | 0 | { |
2032 | | /* Remove block */ |
2033 | 0 | if (block->prev) |
2034 | 0 | { |
2035 | 0 | assert(block->prev->next == block); |
2036 | 0 | block->prev->next = next_block; |
2037 | 0 | } |
2038 | 0 | else |
2039 | 0 | { |
2040 | 0 | assert(*first_blockp == block); |
2041 | 0 | *first_blockp = next_block; |
2042 | 0 | } |
2043 | 0 | if (next_block) |
2044 | 0 | { |
2045 | 0 | assert(next_block->prev == block); |
2046 | 0 | next_block->prev = block->prev; |
2047 | 0 | } |
2048 | 0 | else |
2049 | 0 | { |
2050 | 0 | assert(*last_blockp == block); |
2051 | 0 | *last_blockp = block->prev; |
2052 | 0 | } |
2053 | 0 | } |
2054 | 0 | } |
2055 | 0 | } |
2056 | 0 | } |
2057 | 0 | } |
2058 | | |
2059 | | static fz_stext_struct * |
2060 | | check_for_grid_lines(fz_context *ctx, fz_stext_grid_positions *xps, fz_stext_grid_positions *yps, fz_stext_page *page, fz_stext_struct *parent, int num_subtables) |
2061 | 0 | { |
2062 | 0 | fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block; |
2063 | 0 | grid_walker_data gd = { 0 }; |
2064 | 0 | fz_stext_struct *table = NULL; |
2065 | |
|
2066 | 0 | gd.xpos = xps; |
2067 | 0 | gd.ypos = yps; |
2068 | |
|
2069 | 0 | fz_var(gd); |
2070 | |
|
2071 | 0 | fz_try(ctx) |
2072 | 0 | { |
2073 | 0 | gd.cells = new_cells(ctx, xps->len, yps->len); |
2074 | | |
2075 | | /* First we walk the content looking for grid lines. These |
2076 | | * lines refine our positions. */ |
2077 | 0 | walk_grid_lines(ctx, &gd, *first_blockp); |
2078 | | /* Now, we walk the content looking for content that crosses |
2079 | | * these grid lines. This allows us to spot spanned cells. */ |
2080 | 0 | erase_grid_lines(ctx, &gd, *first_blockp); |
2081 | |
|
2082 | | #ifdef DEBUG_TABLE_STRUCTURE |
2083 | | asciiart_table(&gd); |
2084 | | #endif |
2085 | | /* Now, can we remove some columns or rows? i.e. have we oversegmented? */ |
2086 | 0 | merge_columns(&gd); |
2087 | 0 | merge_rows(&gd); |
2088 | | |
2089 | | /* Did we shrink the table so much it's not a table any more? */ |
2090 | 0 | if (gd.xpos->len < 3 || gd.ypos->len < 3) |
2091 | 0 | break; |
2092 | | |
2093 | 0 | if (num_subtables > 0) |
2094 | 0 | { |
2095 | | /* We are risking throwing away a table we've already found for this |
2096 | | * one. Only do it if this one is really convincing. */ |
2097 | 0 | int x, y; |
2098 | 0 | for (x = 0; x < gd.xpos->len; x++) |
2099 | 0 | if (gd.xpos->list[x].uncertainty != 0) |
2100 | 0 | break; |
2101 | 0 | if (x != gd.xpos->len) |
2102 | 0 | break; |
2103 | 0 | for (y = 0; y < gd.ypos->len; y++) |
2104 | 0 | if (gd.ypos->list[y].uncertainty != 0) |
2105 | 0 | break; |
2106 | 0 | if (y != gd.ypos->len) |
2107 | 0 | break; |
2108 | 0 | } |
2109 | | |
2110 | | #ifdef DEBUG_TABLE_STRUCTURE |
2111 | | printf("Transcribing table: (%g,%g)->(%g,%g)\n", |
2112 | | gd.xpos->list[0].pos, |
2113 | | gd.ypos->list[0].pos, |
2114 | | gd.xpos->list[gd.xpos->len-1].pos, |
2115 | | gd.ypos->list[gd.ypos->len-1].pos); |
2116 | | #endif |
2117 | | |
2118 | | /* Now we should have the entire table calculated. */ |
2119 | 0 | table = transcribe_table(ctx, &gd, page, parent); |
2120 | |
|
2121 | 0 | tidy_orphaned_tables(ctx, page, parent); |
2122 | 0 | } |
2123 | 0 | fz_always(ctx) |
2124 | 0 | { |
2125 | 0 | fz_free(ctx, gd.cells); |
2126 | 0 | } |
2127 | 0 | fz_catch(ctx) |
2128 | 0 | fz_rethrow(ctx); |
2129 | | |
2130 | 0 | return table; |
2131 | 0 | } |
2132 | | |
2133 | | static fz_rect |
2134 | | bbox_of_blocks(fz_stext_block *block) |
2135 | 0 | { |
2136 | 0 | fz_rect r = fz_empty_rect; |
2137 | |
|
2138 | 0 | while (block) |
2139 | 0 | { |
2140 | 0 | r = fz_union_rect(r, block->bbox); |
2141 | 0 | block = block->next; |
2142 | 0 | } |
2143 | |
|
2144 | 0 | return r; |
2145 | 0 | } |
2146 | | |
2147 | | static int |
2148 | | do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) |
2149 | 0 | { |
2150 | 0 | div_list xs = { 0 }; |
2151 | 0 | div_list ys = { 0 }; |
2152 | 0 | fz_stext_block *block; |
2153 | 0 | int count; |
2154 | 0 | fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; |
2155 | 0 | fz_stext_grid_positions *xps = NULL; |
2156 | 0 | fz_stext_grid_positions *yps = NULL; |
2157 | 0 | int num_subtables = 0; |
2158 | | |
2159 | | /* No content? Just bale. */ |
2160 | 0 | if (*first_block == NULL) |
2161 | 0 | return 0; |
2162 | | |
2163 | | /* First off, descend into any children to see if those look like tables. */ |
2164 | 0 | count = 0; |
2165 | 0 | for (block = *first_block; block != NULL; block = block->next) |
2166 | 0 | { |
2167 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
2168 | 0 | { |
2169 | 0 | if (block->u.s.down) |
2170 | 0 | { |
2171 | 0 | num_subtables += do_table_hunt(ctx, page, block->u.s.down); |
2172 | 0 | count++; |
2173 | 0 | } |
2174 | 0 | } |
2175 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
2176 | 0 | count++; |
2177 | 0 | } |
2178 | | |
2179 | | /* If we don't have at least a single child, no more to hunt. */ |
2180 | | /* We only need a single block, because a single text block can |
2181 | | * contain an entire unbordered table. */ |
2182 | 0 | if (count < 1) |
2183 | 0 | return num_subtables; |
2184 | | |
2185 | | /* We only look for a table at this level if we either at the top |
2186 | | * or on a div. This saves us looking for tables within an 'H' |
2187 | | * for example. */ |
2188 | 0 | if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV) |
2189 | 0 | return num_subtables; |
2190 | | |
2191 | | /* If all the content here looks like a column of text, don't |
2192 | | * hunt for a table within it. */ |
2193 | 0 | if (all_blocks_are_justified_or_headers(ctx, *first_block)) |
2194 | 0 | return num_subtables; |
2195 | | |
2196 | 0 | fz_var(xps); |
2197 | 0 | fz_var(yps); |
2198 | |
|
2199 | 0 | fz_try(ctx) |
2200 | 0 | { |
2201 | | /* Now see whether the content looks like tables. |
2202 | | * Currently, we pass descend == 1, which means we consider all the |
2203 | | * content at this level, plus any children. This allows us to cope |
2204 | | * with having oversegmented. */ |
2205 | 0 | walk_blocks(ctx, &xs, &ys, *first_block, 1); |
2206 | |
|
2207 | 0 | sanitize_positions(ctx, &xs); |
2208 | 0 | sanitize_positions(ctx, &ys); |
2209 | | |
2210 | | /* Run across the line, counting 'winding' */ |
2211 | 0 | if (xs.len > 2 && ys.len > 2) |
2212 | 0 | { |
2213 | 0 | fz_stext_struct *table; |
2214 | 0 | fz_rect rect = bbox_of_blocks(*first_block); |
2215 | 0 | xps = make_table_positions(ctx, &xs, rect.x0, rect.x1); |
2216 | 0 | yps = make_table_positions(ctx, &ys, rect.y0, rect.y1); |
2217 | 0 | table = check_for_grid_lines(ctx, xps, yps, page, parent, num_subtables); |
2218 | |
|
2219 | 0 | if (table != NULL) |
2220 | 0 | { |
2221 | 0 | fz_stext_block *block; |
2222 | 0 | fz_stext_grid_positions *xps2 = clone_grid_positions(ctx, page, xps); |
2223 | 0 | fz_stext_grid_positions *yps2 = clone_grid_positions(ctx, page, yps); |
2224 | 0 | block = add_grid_block(ctx, page, &table->first_block, &table->last_block); |
2225 | 0 | block->u.b.xs = xps2; |
2226 | 0 | block->u.b.ys = yps2; |
2227 | 0 | block->bbox.x0 = block->u.b.xs->list[0].pos; |
2228 | 0 | block->bbox.y0 = block->u.b.ys->list[0].pos; |
2229 | 0 | block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos; |
2230 | 0 | block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos; |
2231 | 0 | num_subtables = 1; |
2232 | 0 | } |
2233 | | #ifdef DEBUG_WRITE_AS_PS |
2234 | | { |
2235 | | int i; |
2236 | | printf("%% TABLE\n"); |
2237 | | for (i = 0; i < block->u.b.xs->len; i++) |
2238 | | { |
2239 | | if (block->u.b.xs->list[i].uncertainty) |
2240 | | printf("0 1 0 setrgbcolor\n"); |
2241 | | else |
2242 | | printf("0 0.5 0 setrgbcolor\n"); |
2243 | | printf("%g %g moveto %g %g lineto stroke\n", |
2244 | | block->u.b.xs->list[i].pos, block->bbox.y0, |
2245 | | block->u.b.xs->list[i].pos, block->bbox.y1); |
2246 | | } |
2247 | | for (i = 0; i < block->u.b.ys->len; i++) |
2248 | | { |
2249 | | if (block->u.b.ys->list[i].uncertainty) |
2250 | | printf("0 1 0 setrgbcolor\n"); |
2251 | | else |
2252 | | printf("0 0.5 0 setrgbcolor\n"); |
2253 | | printf("%g %g moveto %g %g lineto stroke\n", |
2254 | | block->bbox.x0, block->u.b.ys->list[i].pos, |
2255 | | block->bbox.x1, block->u.b.ys->list[i].pos); |
2256 | | } |
2257 | | } |
2258 | | #endif |
2259 | 0 | } |
2260 | 0 | } |
2261 | 0 | fz_always(ctx) |
2262 | 0 | { |
2263 | 0 | fz_free(ctx, xs.list); |
2264 | 0 | fz_free(ctx, ys.list); |
2265 | 0 | fz_free(ctx, xps); |
2266 | 0 | fz_free(ctx, yps); |
2267 | 0 | } |
2268 | 0 | fz_catch(ctx) |
2269 | 0 | fz_rethrow(ctx); |
2270 | | |
2271 | 0 | return num_subtables; |
2272 | 0 | } |
2273 | | |
2274 | | void |
2275 | | fz_table_hunt(fz_context *ctx, fz_stext_page *page) |
2276 | 0 | { |
2277 | 0 | if (page == NULL) |
2278 | 0 | return; |
2279 | | |
2280 | 0 | assert(verify_stext(ctx, page, NULL)); |
2281 | | |
2282 | 0 | do_table_hunt(ctx, page, NULL); |
2283 | |
|
2284 | 0 | assert(verify_stext(ctx, page, NULL)); |
2285 | 0 | } |