/src/mupdf/source/fitz/stext-boxer.c
Line | Count | Source |
1 | | // Copyright (C) 2025 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | |
25 | | #undef DEBUG_WRITE_AS_PS |
26 | | #undef DEBUG_STRUCT |
27 | | |
28 | | typedef struct boxer_s boxer_t; |
29 | | |
30 | | typedef struct { |
31 | | int len; |
32 | | int max; |
33 | | double fudge; |
34 | | fz_rect list[FZ_FLEXIBLE_ARRAY]; |
35 | | } rectlist_t; |
36 | | |
37 | | struct boxer_s { |
38 | | fz_rect mediabox; |
39 | | rectlist_t *list; |
40 | | int tight; |
41 | | }; |
42 | | |
43 | | static rectlist_t * |
44 | | rectlist_create(fz_context *ctx, int max, double fudge) |
45 | 0 | { |
46 | 0 | rectlist_t *list = fz_malloc_flexible(ctx, rectlist_t, list, max); |
47 | |
|
48 | 0 | list->len = 0; |
49 | 0 | list->max = max; |
50 | 0 | list->fudge = fudge; |
51 | |
|
52 | 0 | return list; |
53 | 0 | } |
54 | | |
55 | | /* Push box onto rectlist, unless it is completely enclosed by |
56 | | * another box, or completely encloses others (in which case they |
57 | | * are replaced by it). */ |
58 | | static void |
59 | | rectlist_append(rectlist_t *list, fz_rect *box) |
60 | 0 | { |
61 | 0 | int i; |
62 | | /* We allow ourselves a fudge factor when checking for inclusion. |
63 | | * This is either 4 points, or 0 points, depending on whether |
64 | | * we are running in 'tight' mode or not. */ |
65 | 0 | double r_fudge = list->fudge; |
66 | |
|
67 | 0 | for (i = 0; i < list->len; i++) |
68 | 0 | { |
69 | 0 | fz_rect *r = &list->list[i]; |
70 | 0 | fz_rect smaller, larger; |
71 | |
|
72 | 0 | smaller.x0 = r->x0 + r_fudge; |
73 | 0 | larger. x0 = r->x0 - r_fudge; |
74 | 0 | smaller.y0 = r->y0 + r_fudge; |
75 | 0 | larger. y0 = r->y0 - r_fudge; |
76 | 0 | smaller.x1 = r->x1 - r_fudge; |
77 | 0 | larger. x1 = r->x1 + r_fudge; |
78 | 0 | smaller.y1 = r->y1 - r_fudge; |
79 | 0 | larger. y1 = r->y1 + r_fudge; |
80 | |
|
81 | 0 | if (fz_contains_rect(larger, *box)) |
82 | 0 | return; /* box is enclosed! Nothing to do. */ |
83 | 0 | if (fz_contains_rect(*box, smaller)) |
84 | 0 | { |
85 | | /* box encloses r. Ditch r. */ |
86 | | /* Shorten the list */ |
87 | 0 | --list->len; |
88 | | /* If the one that just got chopped off wasn't r, move it down. */ |
89 | 0 | if (i < list->len) |
90 | 0 | { |
91 | 0 | memmove(r, &list->list[list->len], sizeof(*r)); |
92 | 0 | i--; /* Reconsider this entry next time. */ |
93 | 0 | } |
94 | 0 | } |
95 | 0 | } |
96 | | |
97 | 0 | assert(list->len < list->max); |
98 | 0 | memcpy(&list->list[list->len], box, sizeof(*box)); |
99 | 0 | list->len++; |
100 | 0 | } |
101 | | |
102 | | static boxer_t * |
103 | | boxer_create_length(fz_context *ctx, fz_rect *mediabox, int len, int tight) |
104 | 0 | { |
105 | 0 | boxer_t *boxer = fz_malloc_struct(ctx, boxer_t); |
106 | |
|
107 | 0 | if (boxer == NULL) |
108 | 0 | return NULL; |
109 | | |
110 | 0 | memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox)); |
111 | 0 | boxer->list = rectlist_create(ctx, len, tight ? 0 : 4); |
112 | 0 | boxer->tight = tight; |
113 | |
|
114 | 0 | return boxer; |
115 | 0 | } |
116 | | |
117 | | /* Create a boxer structure for a page of size mediabox. */ |
118 | | static boxer_t * |
119 | | boxer_create(fz_context *ctx, fz_rect *mediabox, int tight) |
120 | 0 | { |
121 | 0 | boxer_t *boxer = boxer_create_length(ctx, mediabox, 1, tight); |
122 | |
|
123 | 0 | if (boxer == NULL) |
124 | 0 | return NULL; |
125 | 0 | rectlist_append(boxer->list, mediabox); |
126 | |
|
127 | 0 | return boxer; |
128 | 0 | } |
129 | | |
130 | | static void |
131 | | push_if_intersect_suitable(rectlist_t *dst, const fz_rect *a, const fz_rect *b) |
132 | 0 | { |
133 | 0 | fz_rect c; |
134 | | |
135 | | /* Intersect a and b. */ |
136 | 0 | c = fz_intersect_rect(*a, *b); |
137 | | /* If no intersection, nothing to push. */ |
138 | 0 | if (!fz_is_valid_rect(c)) |
139 | 0 | return; |
140 | | |
141 | 0 | rectlist_append(dst, &c); |
142 | 0 | } |
143 | | |
144 | | static void |
145 | | boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const fz_rect *box) |
146 | 0 | { |
147 | 0 | int i; |
148 | |
|
149 | 0 | for (i = 0; i < src->len; i++) |
150 | 0 | push_if_intersect_suitable(dst, &src->list[i], box); |
151 | 0 | } |
152 | | |
153 | | /* Mark a given box as being occupied (typically by a glyph) */ |
154 | | static void boxer_feed(fz_context *ctx, boxer_t *boxer, fz_rect *bbox) |
155 | 0 | { |
156 | 0 | fz_rect box; |
157 | | /* When we feed a box into a the boxer, we can never make |
158 | | * the list more than 4 times as long. */ |
159 | 0 | rectlist_t *newlist = rectlist_create(ctx, boxer->list->len * 4, boxer->list->fudge); |
160 | |
|
161 | | #ifdef DEBUG_WRITE_AS_PS |
162 | | printf("0 0 1 setrgbcolor\n"); |
163 | | printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n", |
164 | | bbox->x0, bbox->y0, |
165 | | bbox->x0, bbox->y1, |
166 | | bbox->x1, bbox->y1, |
167 | | bbox->x1, bbox->y0 |
168 | | ); |
169 | | #endif |
170 | | |
171 | | /* Left (0,0) (x0,H) */ |
172 | 0 | box.x0 = boxer->mediabox.x0; |
173 | 0 | box.y0 = boxer->mediabox.y0; |
174 | 0 | box.x1 = bbox->x0; |
175 | 0 | box.y1 = boxer->mediabox.y1; |
176 | 0 | boxlist_feed_intersect(newlist, boxer->list, &box); |
177 | | |
178 | | /* Right (x1,0) (W,H) */ |
179 | 0 | box.x0 = bbox->x1; |
180 | 0 | box.y0 = boxer->mediabox.y0; |
181 | 0 | box.x1 = boxer->mediabox.x1; |
182 | 0 | box.y1 = boxer->mediabox.y1; |
183 | 0 | boxlist_feed_intersect(newlist, boxer->list, &box); |
184 | | |
185 | | /* Bottom (0,0) (W,y0) */ |
186 | 0 | box.x0 = boxer->mediabox.x0; |
187 | 0 | box.y0 = boxer->mediabox.y0; |
188 | 0 | box.x1 = boxer->mediabox.x1; |
189 | 0 | box.y1 = bbox->y0; |
190 | 0 | boxlist_feed_intersect(newlist, boxer->list, &box); |
191 | | |
192 | | /* Top (0,y1) (W,H) */ |
193 | 0 | box.x0 = boxer->mediabox.x0; |
194 | 0 | box.y0 = bbox->y1; |
195 | 0 | box.x1 = boxer->mediabox.x1; |
196 | 0 | box.y1 = boxer->mediabox.y1; |
197 | 0 | boxlist_feed_intersect(newlist, boxer->list, &box); |
198 | |
|
199 | 0 | fz_free(ctx, boxer->list); |
200 | 0 | boxer->list = newlist; |
201 | 0 | } |
202 | | |
203 | | #ifdef DEBUG_WRITE_AS_PS |
204 | | static int |
205 | | compare_areas(const void *a_, const void *b_) |
206 | | { |
207 | | const fz_rect *a = (const fz_rect *)a_; |
208 | | const fz_rect *b = (const fz_rect *)b_; |
209 | | double area_a = (a->x1-a->x0) * (a->y1-a->y0); |
210 | | double area_b = (b->x1-b->x0) * (b->y1-b->y0); |
211 | | |
212 | | if (area_a < area_b) |
213 | | return 1; |
214 | | else if (area_a > area_b) |
215 | | return -1; |
216 | | else |
217 | | return 0; |
218 | | } |
219 | | |
220 | | /* Sort the rectangle list to be largest area first. For ease of humans |
221 | | * reading debug output. */ |
222 | | static void boxer_sort(boxer_t *boxer) |
223 | | { |
224 | | qsort(boxer->list->list, boxer->list->len, sizeof(fz_rect), compare_areas); |
225 | | } |
226 | | |
227 | | /* Get the rectangle list for a given boxer. Return value is the length of |
228 | | * the list. Lifespan is until the boxer is modified or freed. */ |
229 | | static int boxer_results(boxer_t *boxer, fz_rect **list) |
230 | | { |
231 | | *list = boxer->list->list; |
232 | | return boxer->list->len; |
233 | | } |
234 | | #endif |
235 | | |
236 | | /* Currently unused debugging routine */ |
237 | | #if 0 |
238 | | static void |
239 | | boxer_dump(fz_context *ctx, boxer_t *boxer) |
240 | | { |
241 | | int i; |
242 | | |
243 | | printf("bbox = %g %g %g %g\n", boxer->mediabox.x0, boxer->mediabox.y0, boxer->mediabox.x1, boxer->mediabox.y1); |
244 | | for (i = 0; i < boxer->list->len; i++) |
245 | | { |
246 | | printf("%d %g %g %g %g\n", i, boxer->list->list[i].x0, boxer->list->list[i].y0, boxer->list->list[i].x1, boxer->list->list[i].y1); |
247 | | } |
248 | | } |
249 | | #endif |
250 | | |
251 | | /* Destroy a boxer. */ |
252 | | static void boxer_destroy(fz_context *ctx, boxer_t *boxer) |
253 | 0 | { |
254 | 0 | if (!boxer) |
255 | 0 | return; |
256 | | |
257 | 0 | fz_free(ctx, boxer->list); |
258 | 0 | fz_free(ctx, boxer); |
259 | 0 | } |
260 | | |
261 | | /* Find the margins for a given boxer. */ |
262 | | static fz_rect boxer_margins(boxer_t *boxer) |
263 | 0 | { |
264 | 0 | rectlist_t *list = boxer->list; |
265 | 0 | int i; |
266 | 0 | fz_rect margins = boxer->mediabox; |
267 | |
|
268 | 0 | for (i = 0; i < list->len; i++) |
269 | 0 | { |
270 | 0 | fz_rect *r = &list->list[i]; |
271 | 0 | if (r->x0 <= margins.x0 && r->y0 <= margins.y0 && r->y1 >= margins.y1) |
272 | 0 | margins.x0 = r->x1; /* Left Margin */ |
273 | 0 | else if (r->x1 >= margins.x1 && r->y0 <= margins.y0 && r->y1 >= margins.y1) |
274 | 0 | margins.x1 = r->x0; /* Right Margin */ |
275 | 0 | else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y0 <= margins.y0) |
276 | 0 | margins.y0 = r->y1; /* Top Margin */ |
277 | 0 | else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y1 >= margins.y1) |
278 | 0 | margins.y1 = r->y0; /* Bottom Margin */ |
279 | 0 | } |
280 | |
|
281 | 0 | return margins; |
282 | 0 | } |
283 | | |
284 | | /* Create a new boxer from a subset of an old one. */ |
285 | | static boxer_t *boxer_subset(fz_context *ctx, boxer_t *boxer, fz_rect rect) |
286 | 0 | { |
287 | 0 | boxer_t *new_boxer = boxer_create_length(ctx, &rect, boxer->list->len, boxer->tight); |
288 | 0 | int i; |
289 | |
|
290 | 0 | if (new_boxer == NULL) |
291 | 0 | return NULL; |
292 | | |
293 | 0 | for (i = 0; i < boxer->list->len; i++) |
294 | 0 | { |
295 | 0 | fz_rect r = fz_intersect_rect(boxer->list->list[i], rect); |
296 | |
|
297 | 0 | if (fz_is_empty_rect(r)) |
298 | 0 | continue; |
299 | 0 | rectlist_append(new_boxer->list, &r); |
300 | 0 | } |
301 | |
|
302 | 0 | return new_boxer; |
303 | 0 | } |
304 | | |
305 | | static int analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *big_boxer, int depth); |
306 | | |
307 | | static void |
308 | | analyse_subset(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *boxer, fz_rect r, int depth) |
309 | 0 | { |
310 | 0 | boxer_t *sub_box = boxer_subset(ctx, boxer, r); |
311 | |
|
312 | 0 | fz_try(ctx) |
313 | 0 | (void)analyse_sub(ctx, page, parent, sub_box, depth); |
314 | 0 | fz_always(ctx) |
315 | 0 | boxer_destroy(ctx, sub_box); |
316 | 0 | fz_catch(ctx) |
317 | 0 | fz_rethrow(ctx); |
318 | 0 | } |
319 | | |
320 | | /* Consider a boxer for subdivision. |
321 | | * Returns 0 if no suitable subdivision point found. |
322 | | * Returns 1 if a subdivision point is found.*/ |
323 | | static int |
324 | | boxer_subdivide(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *boxer, int depth) |
325 | 0 | { |
326 | 0 | rectlist_t *list = boxer->list; |
327 | 0 | double max_size = 0; |
328 | 0 | int i; |
329 | 0 | int horiz = 0; |
330 | 0 | int largest = -1; |
331 | 0 | fz_rect r; |
332 | |
|
333 | 0 | for (i = 0; i < list->len; i++) |
334 | 0 | { |
335 | 0 | r = list->list[i]; |
336 | |
|
337 | 0 | if (r.x0 <= boxer->mediabox.x0 && r.x1 >= boxer->mediabox.x1) |
338 | 0 | { |
339 | | /* Horizontal divider */ |
340 | 0 | double size = r.y1 - r.y0; |
341 | 0 | if (size > max_size) |
342 | 0 | { |
343 | 0 | max_size = size; |
344 | 0 | largest = i; |
345 | 0 | horiz = 1; |
346 | 0 | } |
347 | 0 | } |
348 | 0 | if (r.y0 <= boxer->mediabox.y0 && r.y1 >= boxer->mediabox.y1) |
349 | 0 | { |
350 | | /* Vertical divider */ |
351 | 0 | double size = r.x1 - r.x0; |
352 | 0 | if (size > max_size) |
353 | 0 | { |
354 | 0 | max_size = size; |
355 | 0 | largest = i; |
356 | 0 | horiz = 0; |
357 | 0 | } |
358 | 0 | } |
359 | 0 | } |
360 | |
|
361 | 0 | if (largest == -1) |
362 | 0 | { |
363 | | #ifdef DEBUG_WRITE_AS_PS |
364 | | { |
365 | | printf("%% SUBSET\n"); |
366 | | printf("0 1 1 setrgbcolor\n"); |
367 | | printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", |
368 | | boxer->mediabox.x0, boxer->mediabox.y0, |
369 | | boxer->mediabox.x0, boxer->mediabox.y1, |
370 | | boxer->mediabox.x1, boxer->mediabox.y1, |
371 | | boxer->mediabox.x1, boxer->mediabox.y0); |
372 | | } |
373 | | #endif |
374 | 0 | return 0; |
375 | 0 | } |
376 | | |
377 | 0 | r = boxer->mediabox; |
378 | 0 | if (horiz) |
379 | 0 | { |
380 | | /* Divider runs horizontally. */ |
381 | | #ifdef DEBUG_WRITE_AS_PS |
382 | | { |
383 | | printf("%% H DIVIDER\n"); |
384 | | printf("1 0 1 setrgbcolor\n"); |
385 | | printf("%g %g moveto\n%g %g lineto\nstroke\n\n", |
386 | | list->list[largest].x0, (list->list[largest].y0 + list->list[largest].y1) * 0.5f, |
387 | | list->list[largest].x1, (list->list[largest].y0 + list->list[largest].y1) * 0.5f); |
388 | | } |
389 | | #endif |
390 | 0 | r.y1 = list->list[largest].y0; |
391 | 0 | analyse_subset(ctx, page, parent, boxer, r, depth); |
392 | |
|
393 | 0 | r.y0 = list->list[largest].y1; |
394 | 0 | r.y1 = boxer->mediabox.y1; |
395 | 0 | analyse_subset(ctx, page, parent, boxer, r, depth); |
396 | 0 | } |
397 | 0 | else |
398 | 0 | { |
399 | | /* Divider runs vertically. */ |
400 | | #ifdef DEBUG_WRITE_AS_PS |
401 | | { |
402 | | printf("%% V DIVIDER\n"); |
403 | | printf("1 0 1 setrgbcolor\n"); |
404 | | printf("%g %g moveto\n%g %g lineto\nstroke\n\n", |
405 | | (list->list[largest].x0 + list->list[largest].x1) * 0.5f, list->list[largest].y0, |
406 | | (list->list[largest].x0 + list->list[largest].x1) * 0.5f, list->list[largest].y1); |
407 | | } |
408 | | #endif |
409 | 0 | r.x1 = list->list[largest].x0; |
410 | 0 | analyse_subset(ctx, page, parent, boxer, r, depth); |
411 | |
|
412 | 0 | r.x0 = list->list[largest].x1; |
413 | 0 | r.x1 = boxer->mediabox.x1; |
414 | 0 | analyse_subset(ctx, page, parent, boxer, r, depth); |
415 | 0 | } |
416 | |
|
417 | 0 | return 1; |
418 | 0 | } |
419 | | |
420 | | #ifdef DEBUG_STRUCT |
421 | | static void |
422 | | do_dump_stext(fz_stext_block *block, int depth) |
423 | | { |
424 | | int d; |
425 | | int idx = -1; |
426 | | |
427 | | while (block) |
428 | | { |
429 | | for (d = 0; d < depth; d++) |
430 | | printf(" "); |
431 | | switch(block->type) |
432 | | { |
433 | | case FZ_STEXT_BLOCK_TEXT: |
434 | | printf("TEXT %p\n", block); |
435 | | break; |
436 | | case FZ_STEXT_BLOCK_IMAGE: |
437 | | printf("IMAGE %p\n", block); |
438 | | break; |
439 | | case FZ_STEXT_BLOCK_VECTOR: |
440 | | printf("VECTOR %p\n", block); |
441 | | break; |
442 | | case FZ_STEXT_BLOCK_STRUCT: |
443 | | printf("STRUCT %p (idx=%d)\n", block, block->u.s.index); |
444 | | assert(block->u.s.index > idx); |
445 | | idx = block->u.s.index; |
446 | | do_dump_stext(block->u.s.down->first_block, depth+1); |
447 | | break; |
448 | | } |
449 | | block = block->next; |
450 | | } |
451 | | } |
452 | | |
453 | | static void |
454 | | dump_stext(char *str, fz_stext_block *block) |
455 | | { |
456 | | printf("%s\n", str); |
457 | | |
458 | | do_dump_stext(block, 0); |
459 | | } |
460 | | #endif |
461 | | |
462 | | static void |
463 | | recalc_bbox(fz_stext_block *block) |
464 | 0 | { |
465 | 0 | fz_rect bbox = fz_empty_rect; |
466 | 0 | fz_stext_line *line; |
467 | |
|
468 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
469 | 0 | bbox = fz_union_rect(bbox, line->bbox); |
470 | |
|
471 | 0 | block->bbox = bbox; |
472 | 0 | } |
473 | | |
474 | | static fz_stext_struct * |
475 | | page_subset(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, fz_rect mediabox) |
476 | 0 | { |
477 | 0 | fz_stext_block *block, *next_block; |
478 | 0 | fz_stext_block *target = NULL; /* The first block in our target list */ |
479 | 0 | fz_stext_block *last = NULL; /* The last block in our target list */ |
480 | 0 | fz_stext_struct *target_parent = NULL; |
481 | 0 | fz_stext_block *after = NULL; /* The block we want to insert after (NULL=start of list) */ |
482 | 0 | fz_stext_block *newblock; |
483 | 0 | int idx = 0; |
484 | 0 | int idx2; |
485 | | #ifdef DEBUG_STRUCT |
486 | | dump_stext("BEFORE", parent ? parent->first_block : page->first_block); |
487 | | #endif |
488 | |
|
489 | 0 | block = parent ? parent->first_block : page->first_block; |
490 | 0 | while (block != NULL) |
491 | 0 | { |
492 | 0 | fz_rect bbox; |
493 | |
|
494 | 0 | next_block = block->next; |
495 | |
|
496 | 0 | bbox = block->bbox; |
497 | | |
498 | | /* Can we take the whole block? */ |
499 | 0 | if (bbox.x0 >= mediabox.x0 && bbox.y0 >= mediabox.y0 && bbox.x1 <= mediabox.x1 && bbox.y1 <= mediabox.y1) |
500 | 0 | { |
501 | | /* Unlink block from the current list. */ |
502 | 0 | if (block->prev) |
503 | 0 | block->prev->next = next_block; |
504 | 0 | else if (parent) |
505 | 0 | parent->first_block = next_block; |
506 | 0 | else |
507 | 0 | page->first_block = next_block; |
508 | 0 | if (next_block) |
509 | 0 | next_block->prev = block->prev; |
510 | 0 | else if (parent) |
511 | 0 | parent->last_block = block->prev; |
512 | 0 | else |
513 | 0 | page->last_block = block->prev; |
514 | | |
515 | | /* Add block onto our target list */ |
516 | 0 | if (target == NULL) |
517 | 0 | { |
518 | 0 | target = block; |
519 | 0 | target_parent = parent; |
520 | 0 | after = block->prev; |
521 | 0 | block->prev = NULL; |
522 | 0 | } |
523 | 0 | else |
524 | 0 | { |
525 | 0 | last->next = block; |
526 | 0 | block->prev = last; |
527 | 0 | } |
528 | 0 | last = block; |
529 | 0 | block->next = NULL; |
530 | 0 | } |
531 | 0 | else if (fz_is_empty_rect(fz_intersect_rect(bbox, mediabox))) |
532 | 0 | { |
533 | 0 | } |
534 | 0 | else if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) |
535 | 0 | { |
536 | 0 | parent = block->u.s.down; |
537 | 0 | next_block = parent->first_block; |
538 | 0 | } |
539 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
540 | 0 | { |
541 | | /* Need to look at the parts. */ |
542 | 0 | fz_stext_line *line, *next_line; |
543 | |
|
544 | 0 | newblock = NULL; |
545 | 0 | for (line = block->u.t.first_line; line != NULL; line = next_line) |
546 | 0 | { |
547 | 0 | next_line = line->next; |
548 | 0 | if (line->bbox.x0 >= mediabox.x0 && line->bbox.y0 >= mediabox.y0 && line->bbox.x1 <= mediabox.x1 && line->bbox.y1 <= mediabox.y1) |
549 | 0 | { |
550 | | /* We need to take this line */ |
551 | 0 | if (newblock == NULL) |
552 | 0 | { |
553 | 0 | newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); |
554 | | |
555 | | /* Add the block onto our target list */ |
556 | 0 | if (target == NULL) |
557 | 0 | { |
558 | 0 | target = newblock; |
559 | 0 | target_parent = parent; |
560 | 0 | if (line == block->u.t.first_line) |
561 | 0 | after = block->prev; |
562 | 0 | else |
563 | 0 | after = block; |
564 | 0 | } |
565 | 0 | else |
566 | 0 | { |
567 | 0 | last->next = newblock; |
568 | 0 | newblock->prev = last; |
569 | 0 | } |
570 | 0 | last = newblock; |
571 | 0 | newblock->id = block->id; |
572 | 0 | } |
573 | | |
574 | | /* Unlink line from the current list. */ |
575 | 0 | if (line->prev) |
576 | 0 | line->prev->next = next_line; |
577 | 0 | else |
578 | 0 | block->u.t.first_line = next_line; |
579 | 0 | if (next_line) |
580 | 0 | next_line->prev = line->prev; |
581 | 0 | else |
582 | 0 | block->u.t.last_line = line->prev; |
583 | | |
584 | | /* Add line onto our new block */ |
585 | 0 | if (newblock->u.t.last_line == NULL) |
586 | 0 | { |
587 | 0 | newblock->u.t.first_line = newblock->u.t.last_line = line; |
588 | 0 | line->prev = NULL; |
589 | 0 | } |
590 | 0 | else |
591 | 0 | { |
592 | 0 | line->prev = newblock->u.t.last_line; |
593 | 0 | newblock->u.t.last_line->next = line; |
594 | 0 | newblock->u.t.last_line = line; |
595 | 0 | } |
596 | 0 | line->next = NULL; |
597 | 0 | } |
598 | 0 | } |
599 | 0 | if (newblock) |
600 | 0 | { |
601 | 0 | recalc_bbox(block); |
602 | 0 | recalc_bbox(newblock); |
603 | 0 | } |
604 | 0 | } |
605 | | |
606 | | /* Step onwards (or upwards) */ |
607 | 0 | block = next_block; |
608 | 0 | while (block == NULL) |
609 | 0 | { |
610 | 0 | if (parent == NULL) |
611 | 0 | break; |
612 | 0 | block = parent->up->next; |
613 | 0 | parent = parent->parent; |
614 | 0 | } |
615 | 0 | } |
616 | | |
617 | | /* If no content to add, bale! */ |
618 | 0 | if (target == NULL) |
619 | 0 | return NULL; |
620 | | |
621 | | /* We want to insert a structure node that contains target after 'after'. */ |
622 | 0 | block = target_parent ? target_parent->first_block : page->first_block; |
623 | 0 | if (after != NULL) |
624 | 0 | { |
625 | 0 | while (1) |
626 | 0 | { |
627 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
628 | 0 | idx = block->u.s.index+1; |
629 | 0 | if (block == after) |
630 | 0 | break; |
631 | 0 | block = block->next; |
632 | 0 | } |
633 | 0 | block = block->next; |
634 | 0 | } |
635 | | /* So we want to insert a structure node with index 'idx' after 'after' */ |
636 | | /* Ensure that the following structure nodes have sane index values */ |
637 | 0 | idx2 = idx+1; |
638 | 0 | for (; block != NULL; block = block->next) |
639 | 0 | { |
640 | 0 | if (block->type != FZ_STEXT_BLOCK_STRUCT) |
641 | 0 | continue; |
642 | 0 | if (block->u.s.index >= idx2) |
643 | 0 | break; |
644 | 0 | block->u.s.index = idx2; |
645 | 0 | idx2++; |
646 | 0 | } |
647 | | |
648 | | /* Convert from 'after' to 'before'. */ |
649 | 0 | if (after) |
650 | 0 | block = after->next; |
651 | 0 | else if (target_parent) |
652 | 0 | block = target_parent->first_block; |
653 | 0 | else |
654 | 0 | block = page->first_block; |
655 | | |
656 | | /* So we want to insert just before block, with index 'idx'. */ |
657 | | |
658 | | /* We are going to need to create a new block. Create a complete unlinked one here. */ |
659 | 0 | newblock = fz_new_stext_struct(ctx, page, FZ_STRUCTURE_DIV, "Split", idx); |
660 | 0 | if (block) |
661 | 0 | newblock->prev = block->prev; |
662 | 0 | else if (target_parent) |
663 | 0 | newblock->prev = target_parent->last_block; |
664 | 0 | else |
665 | 0 | newblock->prev = page->last_block; |
666 | 0 | newblock->next = block; |
667 | 0 | newblock->id = target->id; |
668 | | |
669 | | /* Now insert newblock just before block */ |
670 | | /* If block was first, now we are. */ |
671 | 0 | if (target_parent) |
672 | 0 | { |
673 | 0 | if (target_parent->first_block == block) |
674 | 0 | target_parent->first_block = newblock; |
675 | 0 | } |
676 | 0 | else if (page->first_block == block) |
677 | 0 | page->first_block = newblock; |
678 | 0 | if (block == NULL) |
679 | 0 | { |
680 | | /* Inserting at the end! */ |
681 | 0 | if (target_parent) |
682 | 0 | { |
683 | 0 | if (target_parent->last_block) |
684 | 0 | target_parent->last_block->next = newblock; |
685 | 0 | target_parent->last_block = newblock; |
686 | 0 | } |
687 | 0 | else |
688 | 0 | { |
689 | 0 | if (page->last_block) |
690 | 0 | page->last_block->next = newblock; |
691 | 0 | page->last_block = newblock; |
692 | 0 | } |
693 | 0 | } |
694 | 0 | else |
695 | 0 | { |
696 | 0 | if (block->prev) |
697 | 0 | block->prev->next = newblock; |
698 | 0 | block->prev = newblock; |
699 | 0 | } |
700 | |
|
701 | 0 | newblock->u.s.down->first_block = target; |
702 | 0 | newblock->u.s.down->last_block = last; |
703 | 0 | newblock->u.s.down->parent = target_parent; |
704 | 0 | target->prev = NULL; |
705 | |
|
706 | 0 | for (block = target; block->next != NULL; block = block->next) |
707 | 0 | { |
708 | 0 | newblock->bbox = fz_union_rect(newblock->bbox, block->bbox); |
709 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) |
710 | 0 | block->u.s.down->parent = newblock->u.s.down; |
711 | 0 | } |
712 | |
|
713 | 0 | newblock->bbox = fz_union_rect(newblock->bbox, block->bbox); |
714 | 0 | newblock->u.s.down->last_block = block; |
715 | |
|
716 | | #ifdef DEBUG_STRUCT |
717 | | dump_stext("AFTER", parent ? parent->first_block : page->first_block); |
718 | | #endif |
719 | |
|
720 | 0 | return newblock->u.s.down; |
721 | 0 | } |
722 | | |
723 | | enum { |
724 | | MAX_ANALYSIS_DEPTH = 6 |
725 | | }; |
726 | | |
727 | | static int |
728 | | analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent, boxer_t *big_boxer, int depth) |
729 | 0 | { |
730 | 0 | fz_rect margins; |
731 | 0 | boxer_t *boxer; |
732 | 0 | fz_stext_struct *div; |
733 | 0 | int ret = 0; |
734 | | |
735 | | /* Find the margins in the enclosing boxer. This returns |
736 | | * a subset of the bbox of the original. */ |
737 | 0 | margins = boxer_margins(big_boxer); |
738 | | #ifdef DEBUG_WRITE_AS_PS |
739 | | printf("\n\n%% MARGINS %g %g %g %g\n", margins.x0, margins.y0, margins.x1, margins.y1); |
740 | | #endif |
741 | | |
742 | | /* Now subset the rectangles just to include those that are in our bbox. */ |
743 | 0 | boxer = boxer_subset(ctx, big_boxer, margins); |
744 | |
|
745 | 0 | fz_try(ctx) |
746 | 0 | { |
747 | 0 | div = page_subset(ctx, page, parent, boxer->mediabox); |
748 | | /* If nothing subsetted (no textual content in that region), give up. */ |
749 | 0 | if (div == NULL) |
750 | 0 | break; |
751 | | |
752 | 0 | ret = 1; |
753 | |
|
754 | 0 | if (depth < MAX_ANALYSIS_DEPTH) |
755 | 0 | { |
756 | | /* Can we subdivide that region any more? */ |
757 | 0 | if (boxer_subdivide(ctx, page, div, boxer, depth+1)) |
758 | 0 | break; |
759 | 0 | } |
760 | 0 | } |
761 | 0 | fz_always(ctx) |
762 | 0 | { |
763 | 0 | boxer_destroy(ctx, boxer); |
764 | 0 | } |
765 | 0 | fz_catch(ctx) |
766 | 0 | fz_rethrow(ctx); |
767 | | |
768 | 0 | return ret; |
769 | 0 | } |
770 | | |
771 | | static int |
772 | | line_isnt_all_spaces(fz_context *ctx, fz_stext_line *line) |
773 | 0 | { |
774 | 0 | fz_stext_char *ch; |
775 | 0 | for (ch = line->first_char; ch != NULL; ch = ch->next) |
776 | 0 | if (ch->c != 32 && ch->c != 160) |
777 | 0 | return 1; |
778 | 0 | return 0; |
779 | 0 | } |
780 | | |
781 | | static void |
782 | | feed_line(fz_context *ctx, boxer_t *boxer, fz_stext_line *line) |
783 | 0 | { |
784 | 0 | fz_stext_char *ch; |
785 | |
|
786 | 0 | for (ch = line->first_char; ch != NULL; ch = ch->next) |
787 | 0 | { |
788 | 0 | fz_rect r = fz_empty_rect; |
789 | |
|
790 | 0 | if (ch->c == ' ') |
791 | 0 | continue; |
792 | | |
793 | 0 | do |
794 | 0 | { |
795 | 0 | fz_rect bbox = fz_rect_from_quad(ch->quad); |
796 | 0 | float margin = boxer->tight ? 0 : ch->size/4; |
797 | 0 | bbox.x0 -= margin; |
798 | 0 | bbox.y0 -= margin; |
799 | 0 | bbox.x1 += margin; |
800 | 0 | bbox.y1 += margin; |
801 | 0 | r = fz_union_rect(r, bbox); |
802 | 0 | ch = ch->next; |
803 | 0 | } |
804 | 0 | while (ch != NULL && ch->c != ' '); |
805 | 0 | boxer_feed(ctx, boxer, &r); |
806 | 0 | if (ch == NULL) |
807 | 0 | break; |
808 | 0 | } |
809 | 0 | } |
810 | | |
811 | | /* Internal, non-API function, shared with stext-table. */ |
812 | | fz_rect |
813 | | fz_collate_small_vector_run(fz_stext_block **blockp) |
814 | 0 | { |
815 | 0 | fz_stext_block *block = *blockp; |
816 | 0 | fz_stext_block *next; |
817 | 0 | fz_rect r = block->bbox; |
818 | 0 | int MAX_SIZE = 2; |
819 | 0 | int MAX_GAP = 2; |
820 | |
|
821 | 0 | float w = r.x1 - r.x0; |
822 | 0 | float h = r.y1 - r.y0; |
823 | |
|
824 | 0 | if (w < MAX_SIZE) |
825 | 0 | { |
826 | 0 | while ((next = block->next) != NULL && |
827 | 0 | next->type == FZ_STEXT_BLOCK_VECTOR && |
828 | 0 | next->bbox.x0 == r.x0 && |
829 | 0 | next->bbox.x1 == r.x1 && |
830 | 0 | ((next->bbox.y1 > r.y1 && next->bbox.y0 <= r.y1 + MAX_GAP) || |
831 | 0 | (next->bbox.y0 < r.y0 && next->bbox.y1 >= r.y0 - MAX_GAP))) |
832 | 0 | { |
833 | 0 | r = fz_union_rect(r, next->bbox); |
834 | 0 | block = next; |
835 | 0 | } |
836 | 0 | } |
837 | 0 | if (h < MAX_SIZE) |
838 | 0 | { |
839 | 0 | while ((next = block->next) != NULL && |
840 | 0 | next->type == FZ_STEXT_BLOCK_VECTOR && |
841 | 0 | next->bbox.y0 == r.y0 && |
842 | 0 | next->bbox.y1 == r.y1 && |
843 | 0 | ((next->bbox.x1 > r.x1 && next->bbox.x0 <= r.x1 + MAX_GAP) || |
844 | 0 | (next->bbox.x0 < r.x0 && next->bbox.x1 >= r.x0 - MAX_GAP))) |
845 | 0 | { |
846 | 0 | r = fz_union_rect(r, next->bbox); |
847 | 0 | block = next; |
848 | 0 | } |
849 | 0 | } |
850 | |
|
851 | 0 | *blockp = block; |
852 | |
|
853 | 0 | return r; |
854 | 0 | } |
855 | | |
856 | | static void |
857 | | recurse_and_feed(fz_context *ctx, boxer_t *boxer, fz_stext_block *block) |
858 | 0 | { |
859 | 0 | for (; block != NULL; block = block->next) |
860 | 0 | { |
861 | 0 | fz_stext_line *line; |
862 | 0 | switch (block->type) |
863 | 0 | { |
864 | 0 | case FZ_STEXT_BLOCK_TEXT: |
865 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
866 | 0 | if (line_isnt_all_spaces(ctx, line)) |
867 | 0 | feed_line(ctx, boxer, line); |
868 | 0 | break; |
869 | 0 | case FZ_STEXT_BLOCK_VECTOR: |
870 | 0 | { |
871 | | /* Allow a 1 point margin around vectors to avoid hairline |
872 | | * cracks between supposedly abutting things. */ |
873 | 0 | int VECTOR_MARGIN = 1; |
874 | 0 | fz_rect r = fz_collate_small_vector_run(&block); |
875 | |
|
876 | 0 | r.x0 -= VECTOR_MARGIN; |
877 | 0 | r.y0 -= VECTOR_MARGIN; |
878 | 0 | r.x1 += VECTOR_MARGIN; |
879 | 0 | r.y1 += VECTOR_MARGIN; |
880 | 0 | boxer_feed(ctx, boxer, &r); |
881 | 0 | break; |
882 | 0 | } |
883 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
884 | 0 | if(block->u.s.down) |
885 | 0 | recurse_and_feed(ctx, boxer, block->u.s.down->first_block); |
886 | 0 | break; |
887 | 0 | default: |
888 | 0 | boxer_feed(ctx, boxer, &block->bbox); |
889 | 0 | break; |
890 | 0 | } |
891 | 0 | } |
892 | 0 | } |
893 | | |
894 | | static int |
895 | | segment_rect(fz_context *ctx, fz_rect box, fz_stext_page *page, fz_stext_struct *parent, int tight) |
896 | 0 | { |
897 | 0 | boxer_t *boxer; |
898 | 0 | int ret = 0; |
899 | |
|
900 | 0 | boxer = boxer_create(ctx, &box, tight); |
901 | |
|
902 | 0 | fz_try(ctx) |
903 | 0 | { |
904 | 0 | recurse_and_feed(ctx, boxer, parent ? parent->first_block : page->first_block); |
905 | |
|
906 | 0 | ret = analyse_sub(ctx, page, parent, boxer, 0); |
907 | 0 | } |
908 | 0 | fz_always(ctx) |
909 | 0 | boxer_destroy(ctx, boxer); |
910 | 0 | fz_catch(ctx) |
911 | 0 | fz_rethrow(ctx); |
912 | | |
913 | | #ifdef DEBUG_WRITE_AS_PS |
914 | | printf("showpage\n"); |
915 | | #endif |
916 | | |
917 | 0 | return ret; |
918 | 0 | } |
919 | | |
920 | | int fz_segment_stext_rect(fz_context *ctx, fz_stext_page *page, fz_rect rect) |
921 | 0 | { |
922 | | #ifdef DEBUG_WRITE_AS_PS |
923 | | printf("1 -1 scale 0 -%g translate\n", rect.y1-rect.y0); |
924 | | #endif |
925 | |
|
926 | 0 | return segment_rect(ctx, rect, page, NULL, 1); |
927 | 0 | } |
928 | | |
929 | | int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page) |
930 | 0 | { |
931 | 0 | fz_stext_block *block; |
932 | |
|
933 | 0 | fz_stext_remove_page_fill(ctx, page); |
934 | | |
935 | | /* If we have structure already, give up. We can't hope to beat |
936 | | * proper structure! */ |
937 | 0 | for (block = page->first_block; block != NULL; block = block->next) |
938 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
939 | 0 | return 0; |
940 | | |
941 | | #ifdef DEBUG_WRITE_AS_PS |
942 | | printf("1 -1 scale 0 -%g translate\n", page->mediabox.y1-page->mediabox.y0); |
943 | | #endif |
944 | | |
945 | 0 | return segment_rect(ctx, page->mediabox, page, NULL, 0); |
946 | 0 | } |
947 | | |
948 | | int |
949 | | fz_stext_remove_page_fill(fz_context *ctx, fz_stext_page *page) |
950 | 0 | { |
951 | 0 | fz_stext_page_block_iterator iter; |
952 | 0 | int dropped = 0; |
953 | 0 | fz_rect coverage = fz_empty_rect; |
954 | | |
955 | | /* First, find the area actually covered on the page. */ |
956 | 0 | for (iter = fz_stext_page_block_iterator_begin_dfs(page); !fz_stext_page_block_iterator_eod_dfs(iter); iter = fz_stext_page_block_iterator_next_dfs(iter)) |
957 | 0 | { |
958 | | /* Try to ignore stuff that's completely off screen */ |
959 | 0 | fz_rect bbox = fz_intersect_rect(page->mediabox, iter.block->bbox); |
960 | |
|
961 | 0 | coverage = fz_union_rect(coverage, bbox); |
962 | 0 | } |
963 | | |
964 | | /* Iterate across all the blocks in the page in a depth first order. We'll break out |
965 | | * when we find the first one that is not a white (or transparent) rectangle fill |
966 | | * that covers a significant amount of the page. This therefore copes with several |
967 | | * page fills on the same page. */ |
968 | 0 | for (iter = fz_stext_page_block_iterator_begin_dfs(page); !fz_stext_page_block_iterator_eod_dfs(iter); iter = fz_stext_page_block_iterator_next_dfs(iter)) |
969 | 0 | { |
970 | 0 | fz_rect bbox; |
971 | | |
972 | | /* Stop searching when we find something that's not a vector */ |
973 | 0 | if (iter.block->type != FZ_STEXT_BLOCK_VECTOR) |
974 | 0 | break; |
975 | | |
976 | | /* Stop searching when we find a vector that's not a rectangle */ |
977 | 0 | if ((iter.block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0) |
978 | 0 | break; |
979 | | |
980 | | /* Stop searching when we find a vector that's stroked */ |
981 | 0 | if ((iter.block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED) != 0) |
982 | 0 | break; |
983 | | |
984 | | /* Stop searching when we find a vector that's not white (or invisible) */ |
985 | 0 | if ((iter.block->u.v.argb & 0xff000000) != 0 && (iter.block->u.v.argb & 0xffffff) != 0xffffff) |
986 | 0 | break; |
987 | | |
988 | | /* If we don't cover the coverage area, then we can't be a background. */ |
989 | 0 | bbox = fz_expand_rect(iter.block->bbox, 0.1f); /* Allow for rounding */ |
990 | 0 | if (!fz_contains_rect(bbox, coverage)) |
991 | 0 | break; |
992 | | |
993 | | /* If we don't cover at least 90% of the height/width, then give up. */ |
994 | 0 | if (fz_is_infinite_rect(page->mediabox)) |
995 | 0 | { |
996 | | /* Can't judge. Skip this check. */ |
997 | 0 | } |
998 | 0 | else if ((iter.block->bbox.y1 - iter.block->bbox.y0) < 0.9f * (page->mediabox.y1 - page->mediabox.y0) || |
999 | 0 | (iter.block->bbox.x1 - iter.block->bbox.x0) < 0.9f * (page->mediabox.x1 - page->mediabox.x0)) |
1000 | 0 | { |
1001 | 0 | break; |
1002 | 0 | } |
1003 | | |
1004 | | /* This is a background block. Remove it. This will be the first block |
1005 | | * in the list, so it's simpler. */ |
1006 | 0 | if (iter.parent) |
1007 | 0 | { |
1008 | 0 | iter.parent->first_block = iter.block->next; |
1009 | 0 | if (iter.block->next) |
1010 | 0 | iter.block->next->prev = NULL; |
1011 | 0 | else |
1012 | 0 | iter.parent->last_block = NULL; |
1013 | 0 | } |
1014 | 0 | else |
1015 | 0 | { |
1016 | 0 | iter.page->first_block = iter.block->next; |
1017 | 0 | if (iter.block->next) |
1018 | 0 | iter.block->next->prev = NULL; |
1019 | 0 | else |
1020 | 0 | iter.page->last_block = NULL; |
1021 | 0 | } |
1022 | |
|
1023 | 0 | dropped = 1; |
1024 | 0 | } |
1025 | |
|
1026 | 0 | return dropped; |
1027 | 0 | } |