/src/mupdf/source/fitz/stext-device.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2026 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // You should have received a copy of the GNU Affero General Public License |
15 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
16 | | // |
17 | | // Alternative licensing terms are available from the licensor. |
18 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
19 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
20 | | // CA 94129, USA, for further information. |
21 | | |
22 | | #include "mupdf/fitz.h" |
23 | | |
24 | | #include "glyphbox.h" |
25 | | |
26 | | #include <float.h> |
27 | | #include <string.h> |
28 | | |
29 | | /* Simple layout structure */ |
30 | | |
31 | | fz_layout_block *fz_new_layout(fz_context *ctx) |
32 | 0 | { |
33 | 0 | fz_pool *pool = fz_new_pool(ctx); |
34 | 0 | fz_layout_block *block; |
35 | 0 | fz_try(ctx) |
36 | 0 | { |
37 | 0 | block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block)); |
38 | 0 | block->pool = pool; |
39 | 0 | block->head = NULL; |
40 | 0 | block->tailp = &block->head; |
41 | 0 | } |
42 | 0 | fz_catch(ctx) |
43 | 0 | { |
44 | 0 | fz_drop_pool(ctx, pool); |
45 | 0 | fz_rethrow(ctx); |
46 | 0 | } |
47 | 0 | return block; |
48 | 0 | } |
49 | | |
50 | | void fz_drop_layout(fz_context *ctx, fz_layout_block *block) |
51 | 0 | { |
52 | 0 | if (block) |
53 | 0 | fz_drop_pool(ctx, block->pool); |
54 | 0 | } |
55 | | |
56 | | void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p) |
57 | 0 | { |
58 | 0 | fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line)); |
59 | 0 | line->x = x; |
60 | 0 | line->y = y; |
61 | 0 | line->font_size = font_size; |
62 | 0 | line->p = p; |
63 | 0 | line->text = NULL; |
64 | 0 | line->next = NULL; |
65 | 0 | *block->tailp = line; |
66 | 0 | block->tailp = &line->next; |
67 | 0 | block->text_tailp = &line->text; |
68 | 0 | } |
69 | | |
70 | | void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p) |
71 | 0 | { |
72 | 0 | fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char)); |
73 | 0 | ch->x = x; |
74 | 0 | ch->advance = advance; |
75 | 0 | ch->p = p; |
76 | 0 | ch->next = NULL; |
77 | 0 | *block->text_tailp = ch; |
78 | 0 | block->text_tailp = &ch->next; |
79 | 0 | } |
80 | | |
81 | | /* Extract text into blocks and lines. */ |
82 | | |
/* Thresholds used when deciding how a new character relates to the previous
 * one. Where their use is visible in this file they are compared against
 * distances that have already been divided by the font size. */

/* Not used in the part of the file reviewed here; presumably the line gap
 * beyond which a new paragraph is started - confirm at the point of use. */
#define PARAGRAPH_DIST 1.5f
/* Along-the-baseline gap below which a character simply continues the line. */
#define SPACE_DIST 0.15f
/* Upper bound on a "small jump" that may be treated as a missing space. */
#define SPACE_MAX_DIST 0.8f
/* Maximum offset from the previous baseline for a character to be considered
 * on the same line. */
#define BASE_MAX_DIST 0.8f
/* Maximum distance between two identical characters for the second one to be
 * treated as a fake-bold overprint. */
#define FAKE_BOLD_MAX_DIST 0.1f
88 | | |
89 | | /* We keep a stack of the different metatexts that apply at any |
90 | | * given point (normally none!). Whenever we get some content |
91 | | * with a metatext in force, we really want to update the bounds |
92 | | * for that metatext. But running along the whole list each time |
93 | | * would be painful. So we just update the bounds for dev->metatext |
94 | | * and rely on metatext_bounds() propagating it upwards 'just in |
95 | | * time' for us to use metatexts other than the latest one. This |
96 | | * also means we need to propagate bounds upwards when we pop |
97 | | * a metatext. |
98 | | * |
99 | | * Why do we need bounds at all? Well, suppose we get: |
100 | | * /Span <</ActualText (c) >> BDC /Im0 Do EMC |
101 | | * Then where on the page do we put 'c' ? By collecting the |
102 | | * bounds, we can place 'c' wherever the image was. |
103 | | */ |
/* One entry in the device's stack of metatexts. */
typedef struct metatext_t
{
	/* Kind of metatext; find_actualtext() searches for FZ_METATEXT_ACTUALTEXT. */
	fz_metatext type;
	/* Text associated with this entry (presumably the replacement string - confirm). */
	char *text;
	/* Accumulated bounds; metatext_bounds() unions inner bounds into outer ones. */
	fz_rect bounds;
	/* Next entry further down the stack, or NULL at the bottom. */
	struct metatext_t *prev;
} metatext_t;
111 | | |
/* Element type of the device's 'rects' array. */
typedef struct
{
	/* Two end points and a thickness; presumably a thin rectangle reduced to
	 * a stroked line segment - TODO confirm against the code that fills this in. */
	fz_point from;
	fz_point to;
	float thickness;
} rect_details;
118 | | |
/* The structured text device: an fz_device (embedded first as 'super') plus
 * the state needed to build up an fz_stext_page as content arrives. */
typedef struct
{
	fz_device super;
	/* The page that extracted content is added to. */
	fz_stext_page *page;
	int id;
	/* pen: position that the distance moved since the last character is
	 * measured from. */
	fz_point pen, start;
	// maybe_bullet: True if the 'start' position recorded was done so after either some actualtext
	// on an image, or after a glyph that's known to be used for bullets. This is used to stop us
	// spotting an 'indented' paragraph, because it's possibly just a bulleted list.
	int maybe_bullet;
	/* Position compared against a new character's start when looking for a
	 * repeated (fake bold) glyph. */
	fz_point lag_pen;
	fz_matrix trm;
	int new_obj;
	/* The last character code added, and the line/bidi value it was added with. */
	int lastchar;
	fz_stext_line *lastline;
	int lastbidi;
	int flags;
	/* Colour value stored as 'argb' on each character added. */
	int color;
	/* Set when the previous glyph was swallowed by check_for_fake_bold(). */
	int last_was_fake_bold;
	const fz_text *lasttext;
	fz_stext_options opts;

	/* Top of the metatext stack (entries are linked through 'prev'), or NULL. */
	metatext_t *metatext;

	/* Store the last values we saw. We need this for flushing the actualtext. */
	struct
	{
		int valid;
		int clipped;
		fz_matrix trm;
		int wmode;
		int bidi_level;
		fz_font *font;
		int flags;
	} last;

	/* The list of 'rects' seen during processing (if we're collecting styles). */
	int rect_max;
	int rect_len;
	rect_details *rects;

	/* Head and tail of a list of vector blocks held back until
	 * flush_lazy_vectors() splices them into the page. */
	fz_stext_block *lazy_vectors;
	fz_stext_block *lazy_vectors_tail;
} fz_stext_device;
163 | | |
/* Help text describing the structured text options: a heading followed by
 * one tab-indented line per option, terminated by a blank line. */
const char *fz_stext_options_usage =
	"Structured text options:\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-spans: do not merge spans on the same line\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tparagraph-break: break blocks at paragraph boundaries\n"
	"\tdehyphenate: attempt to join up hyphenated words\n"
	"\tignore-actualtext: do not apply ActualText replacements\n"
	"\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
	"\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
	"\taccurate-bboxes: calculate char bboxes from the outlines\n"
	"\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
	"\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
	"\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
	"\tclip: do not include text that is completely clipped\n"
	"\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
	"\tstructured: collect structure markup\n"
	"\tvectors: include vector bboxes in output\n"
	"\tsegment: attempt to segment the page\n"
	"\ttable-hunt: hunt for tables within a (segmented) page\n"
	"\tresolution: resolution to render at\n"
	"\n";
188 | | |
189 | | /* Find the current actualtext, if any. Will abort if dev == NULL. */ |
190 | | static metatext_t * |
191 | | find_actualtext(fz_stext_device *dev) |
192 | 0 | { |
193 | 0 | metatext_t *mt = dev->metatext; |
194 | |
|
195 | 0 | while (mt && mt->type != FZ_METATEXT_ACTUALTEXT) |
196 | 0 | mt = mt->prev; |
197 | |
|
198 | 0 | return mt; |
199 | 0 | } |
200 | | |
201 | | /* Find the bounds of the given metatext. Will abort if mt or |
202 | | * dev are NULL. */ |
203 | | static fz_rect * |
204 | | metatext_bounds(metatext_t *mt, fz_stext_device *dev) |
205 | 0 | { |
206 | 0 | metatext_t *mt2 = dev->metatext; |
207 | |
|
208 | 0 | while (mt2 != mt) |
209 | 0 | { |
210 | 0 | mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds); |
211 | 0 | mt2 = mt2->prev; |
212 | 0 | } |
213 | |
|
214 | 0 | return &mt->bounds; |
215 | 0 | } |
216 | | |
217 | | /* Find the bounds of the current actualtext, or NULL if there |
218 | | * isn't one. Will abort if dev is NULL. */ |
219 | | static fz_rect * |
220 | | actualtext_bounds(fz_stext_device *dev) |
221 | 0 | { |
222 | 0 | metatext_t *mt = find_actualtext(dev); |
223 | |
|
224 | 0 | if (mt == NULL) |
225 | 0 | return NULL; |
226 | | |
227 | 0 | return metatext_bounds(mt, dev); |
228 | 0 | } |
229 | | |
230 | | fz_stext_page * |
231 | | fz_new_stext_page(fz_context *ctx, fz_rect mediabox) |
232 | 0 | { |
233 | 0 | fz_pool *pool = fz_new_pool(ctx); |
234 | 0 | fz_stext_page *page = NULL; |
235 | 0 | fz_try(ctx) |
236 | 0 | { |
237 | 0 | page = fz_pool_alloc(ctx, pool, sizeof(*page)); |
238 | 0 | page->refs = 1; |
239 | 0 | page->pool = pool; |
240 | 0 | page->mediabox = mediabox; |
241 | 0 | page->first_block = NULL; |
242 | 0 | page->last_block = NULL; |
243 | 0 | page->id_list = fz_new_pool_array(ctx, pool, fz_stext_page_details, 4); |
244 | 0 | } |
245 | 0 | fz_catch(ctx) |
246 | 0 | { |
247 | 0 | fz_drop_pool(ctx, pool); |
248 | 0 | fz_rethrow(ctx); |
249 | 0 | } |
250 | 0 | return page; |
251 | 0 | } |
252 | | |
253 | | static void |
254 | | drop_run(fz_context *ctx, fz_stext_block *block) |
255 | 0 | { |
256 | 0 | fz_stext_line *line; |
257 | 0 | fz_stext_char *ch; |
258 | 0 | while (block) |
259 | 0 | { |
260 | 0 | switch (block->type) |
261 | 0 | { |
262 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
263 | 0 | fz_drop_image(ctx, block->u.i.image); |
264 | 0 | break; |
265 | 0 | case FZ_STEXT_BLOCK_TEXT: |
266 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
267 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
268 | 0 | fz_drop_font(ctx, ch->font); |
269 | 0 | break; |
270 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
271 | 0 | drop_run(ctx, block->u.s.down->first_block); |
272 | 0 | break; |
273 | 0 | default: |
274 | 0 | break; |
275 | 0 | } |
276 | 0 | block = block->next; |
277 | 0 | } |
278 | 0 | } |
279 | | |
280 | | fz_stext_page_details *fz_stext_page_details_for_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block) |
281 | 0 | { |
282 | 0 | if (block == NULL || page == NULL) |
283 | 0 | fz_throw(ctx, FZ_ERROR_ARGUMENT, "page details require a page and a block"); |
284 | | |
285 | 0 | return (fz_stext_page_details *)fz_pool_array_lookup(ctx, page->id_list, block->id); |
286 | 0 | } |
287 | | |
/* Take a new reference to a structured text page via fz_keep_imp on its
 * 'refs' counter, returning whatever fz_keep_imp returns. */
fz_stext_page *
fz_keep_stext_page(fz_context *ctx, fz_stext_page *page)
{
	return fz_keep_imp(ctx, page, &page->refs);
}
293 | | |
294 | | void |
295 | | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) |
296 | 0 | { |
297 | 0 | if (page == NULL) |
298 | 0 | return; |
299 | | |
300 | 0 | if (fz_drop_imp(ctx, page, &page->refs)) |
301 | 0 | { |
302 | 0 | drop_run(ctx, page->first_block); |
303 | 0 | fz_drop_pool(ctx, page->pool); |
304 | 0 | } |
305 | 0 | } |
306 | | |
307 | | /* |
308 | | * This adds a new block at the end of the page. This should not be used |
309 | | * to add 'struct' blocks to the page as those have to be added internally, |
310 | | * with more complicated pointer setup. |
311 | | */ |
312 | | static fz_stext_block * |
313 | | add_block_to_page(fz_context *ctx, fz_stext_page *page, int type, int id) |
314 | 0 | { |
315 | 0 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
316 | 0 | block->bbox = fz_empty_rect; /* Fixes bug 703267. */ |
317 | 0 | block->prev = page->last_block; |
318 | 0 | block->type = type; |
319 | 0 | block->id = id; |
320 | 0 | if (page->last_struct) |
321 | 0 | { |
322 | 0 | if (page->last_struct->last_block) |
323 | 0 | { |
324 | 0 | block->prev = page->last_struct->last_block; |
325 | 0 | block->prev->next = block; |
326 | 0 | page->last_struct->last_block = block; |
327 | 0 | } |
328 | 0 | else |
329 | 0 | page->last_struct->last_block = page->last_struct->first_block = block; |
330 | 0 | } |
331 | 0 | else if (!page->last_block) |
332 | 0 | { |
333 | 0 | assert(!page->first_block); |
334 | 0 | page->first_block = page->last_block = block; |
335 | 0 | } |
336 | 0 | else |
337 | 0 | { |
338 | 0 | page->last_block->next = block; |
339 | 0 | page->last_block = block; |
340 | 0 | } |
341 | 0 | return block; |
342 | 0 | } |
343 | | |
344 | | static fz_stext_block * |
345 | | add_lazy_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, int id) |
346 | 0 | { |
347 | 0 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
348 | 0 | block->bbox = fz_empty_rect; |
349 | 0 | block->prev = tdev->lazy_vectors_tail; |
350 | 0 | block->type = FZ_STEXT_BLOCK_VECTOR; |
351 | 0 | block->id = id; |
352 | |
|
353 | 0 | if (tdev->lazy_vectors == NULL) |
354 | 0 | tdev->lazy_vectors = block; |
355 | 0 | else |
356 | 0 | tdev->lazy_vectors_tail->next = block; |
357 | 0 | tdev->lazy_vectors_tail = block; |
358 | |
|
359 | 0 | return block; |
360 | 0 | } |
361 | | |
362 | | static void |
363 | | flush_lazy_vectors(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev) |
364 | 0 | { |
365 | 0 | if (tdev->lazy_vectors == NULL) |
366 | 0 | return; |
367 | | |
368 | 0 | if (page->last_struct) |
369 | 0 | { |
370 | 0 | if (page->last_struct->last_block) |
371 | 0 | { |
372 | 0 | page->last_struct->last_block->next = tdev->lazy_vectors; |
373 | 0 | tdev->lazy_vectors->prev = page->last_struct->last_block; |
374 | 0 | page->last_struct->last_block = tdev->lazy_vectors_tail; |
375 | 0 | } |
376 | 0 | else |
377 | 0 | { |
378 | 0 | page->last_struct->first_block = tdev->lazy_vectors; |
379 | 0 | page->last_struct->last_block = tdev->lazy_vectors_tail; |
380 | 0 | } |
381 | 0 | } |
382 | 0 | else if (!page->last_block) |
383 | 0 | { |
384 | 0 | page->first_block = tdev->lazy_vectors; |
385 | 0 | page->last_block = tdev->lazy_vectors_tail; |
386 | 0 | } |
387 | 0 | else |
388 | 0 | { |
389 | 0 | page->last_block->next = tdev->lazy_vectors; |
390 | 0 | tdev->lazy_vectors->prev = page->last_block; |
391 | 0 | page->last_block = tdev->lazy_vectors_tail; |
392 | 0 | } |
393 | |
|
394 | 0 | tdev->lazy_vectors = tdev->lazy_vectors_tail = NULL; |
395 | 0 | } |
396 | | |
/* Append a new block of type FZ_STEXT_BLOCK_TEXT, with the given id, at the
 * current insertion point (see add_block_to_page). */
static fz_stext_block *
add_text_block_to_page(fz_context *ctx, fz_stext_page *page, int id)
{
	return add_block_to_page(ctx, page, FZ_STEXT_BLOCK_TEXT, id);
}
402 | | |
403 | | static fz_stext_block * |
404 | | add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image, int id) |
405 | 0 | { |
406 | 0 | fz_stext_block *block = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_IMAGE, id); |
407 | 0 | block->u.i.transform = ctm; |
408 | 0 | block->u.i.image = fz_keep_image(ctx, image); |
409 | 0 | block->bbox = fz_transform_rect(fz_unit_rect, ctm); |
410 | 0 | return block; |
411 | 0 | } |
412 | | |
413 | | static fz_stext_line * |
414 | | add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi) |
415 | 0 | { |
416 | 0 | fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); |
417 | 0 | line->prev = block->u.t.last_line; |
418 | 0 | if (!block->u.t.first_line) |
419 | 0 | block->u.t.first_line = block->u.t.last_line = line; |
420 | 0 | else |
421 | 0 | { |
422 | 0 | block->u.t.last_line->next = line; |
423 | 0 | block->u.t.last_line = line; |
424 | 0 | } |
425 | |
|
426 | 0 | line->dir = *dir; |
427 | 0 | line->wmode = wmode; |
428 | |
|
429 | 0 | return line; |
430 | 0 | } |
431 | | |
/* Special values for the 'glyph' argument of add_char_to_line(), used in
 * place of a real glyph index. */

/* An added space in accurate mode: ascent and descent are both zero. */
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
/* Non accurate mode: use the font's ascender/descender, not glyph bounds. */
#define NON_ACCURATE_GLYPH (-1)
434 | | |
435 | | static fz_stext_char * |
436 | | add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags) |
437 | 0 | { |
438 | 0 | fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); |
439 | 0 | fz_point a, d; |
440 | |
|
441 | 0 | if (!line->first_char) |
442 | 0 | line->first_char = line->last_char = ch; |
443 | 0 | else |
444 | 0 | { |
445 | 0 | line->last_char->next = ch; |
446 | 0 | line->last_char = ch; |
447 | 0 | } |
448 | |
|
449 | 0 | ch->c = c; |
450 | 0 | ch->argb = color; |
451 | 0 | ch->bidi = bidi; |
452 | 0 | ch->origin = *p; |
453 | 0 | ch->size = size; |
454 | 0 | ch->font = fz_keep_font(ctx, font); |
455 | 0 | ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0) | (synthetic > 1 ? FZ_STEXT_SYNTHETIC_LARGE : 0); |
456 | 0 | if (font->flags.is_bold) |
457 | 0 | ch->flags |= FZ_STEXT_BOLD; |
458 | |
|
459 | 0 | if (line->wmode == 0) |
460 | 0 | { |
461 | 0 | fz_rect bounds; |
462 | 0 | int bounded = 0; |
463 | 0 | a.x = 0; |
464 | 0 | d.x = 0; |
465 | 0 | if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE) |
466 | 0 | { |
467 | | /* Added space, in accurate mode. */ |
468 | 0 | a.y = d.y = 0; |
469 | 0 | } |
470 | 0 | else if (glyph == NON_ACCURATE_GLYPH) |
471 | 0 | { |
472 | | /* Non accurate mode. */ |
473 | 0 | a.y = fz_font_ascender(ctx, font); |
474 | 0 | d.y = fz_font_descender(ctx, font); |
475 | 0 | } |
476 | 0 | else |
477 | 0 | { |
478 | | /* Any glyph in accurate mode */ |
479 | 0 | bounds = fz_bound_glyph(ctx, font, glyph, fz_identity); |
480 | 0 | bounded = 1; |
481 | 0 | a.y = bounds.y1; |
482 | 0 | d.y = bounds.y0; |
483 | 0 | } |
484 | 0 | if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS) |
485 | 0 | { |
486 | 0 | if (!bounded) |
487 | 0 | bounds = fz_bound_glyph(ctx, font, glyph, fz_identity); |
488 | 0 | if (a.x > bounds.x0) |
489 | 0 | a.x = bounds.x0; |
490 | 0 | if (d.y < bounds.x1) |
491 | 0 | d.y = bounds.x1; |
492 | 0 | } |
493 | 0 | } |
494 | 0 | else |
495 | 0 | { |
496 | 0 | a.x = 1; |
497 | 0 | d.x = 0; |
498 | 0 | a.y = 0; |
499 | 0 | d.y = 0; |
500 | 0 | } |
501 | 0 | a = fz_transform_vector(a, trm); |
502 | 0 | d = fz_transform_vector(d, trm); |
503 | |
|
504 | 0 | ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y); |
505 | 0 | ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y); |
506 | 0 | ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y); |
507 | 0 | ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y); |
508 | |
|
509 | 0 | return ch; |
510 | 0 | } |
511 | | |
512 | | static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail) |
513 | 0 | { |
514 | 0 | fz_stext_char *prev, *next; |
515 | 0 | prev = tail; |
516 | 0 | while (curr != tail) |
517 | 0 | { |
518 | 0 | next = curr->next; |
519 | 0 | curr->next = prev; |
520 | 0 | prev = curr; |
521 | 0 | curr = next; |
522 | 0 | } |
523 | 0 | return prev; |
524 | 0 | } |
525 | | |
526 | | static void reverse_bidi_line(fz_stext_line *line) |
527 | 0 | { |
528 | 0 | fz_stext_char *a, *b, **prev; |
529 | 0 | prev = &line->first_char; |
530 | 0 | for (a = line->first_char; a; a = a->next) |
531 | 0 | { |
532 | 0 | if (a->bidi) |
533 | 0 | { |
534 | 0 | b = a; |
535 | 0 | while (b->next && b->next->bidi) |
536 | 0 | b = b->next; |
537 | 0 | if (a != b) |
538 | 0 | *prev = reverse_bidi_span(a, b->next); |
539 | 0 | } |
540 | 0 | prev = &a->next; |
541 | 0 | line->last_char = a; |
542 | 0 | } |
543 | 0 | } |
544 | | |
/* Return 1 if c is one of the hyphen characters we recognise, else 0. */
int fz_is_unicode_hyphen(int c)
{
	switch (c)
	{
	case '-':    /* hyphen-minus */
	case 0xAD:   /* soft hyphen */
	case 0x2010: /* hyphen */
	case 0x2011: /* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
550 | | |
/* Return the dot product of the two points treated as 2D vectors. */
static float
vec_dot(const fz_point *a, const fz_point *b)
{
	return a->x * b->x + a->y * b->y;
}
556 | | |
/* Decide, from the previous character, whether a space may be inserted
 * after it. Never after an existing space; otherwise only after code
 * points below 0x700 (basic latin, greek, cyrillic, hebrew, arabic) or in
 * 0x2000..0x20CF (general punctuation, superscripts and subscripts, and
 * currency symbols). */
static int may_add_space(int lastchar)
{
	if (lastchar == ' ')
		return 0;
	if (lastchar < 0x700)
		return 1;
	return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
566 | | |
567 | 0 | #define FAKEBOLD_THRESHOLD_RECIP (1.0f / FAKE_BOLD_MAX_DIST) |
568 | | |
569 | | static int |
570 | | is_within_fake_bold_distance(float a, float b, float size) |
571 | 0 | { |
572 | 0 | a -= b; |
573 | 0 | if (a < 0) |
574 | 0 | a = -a; |
575 | |
|
576 | 0 | return FAKEBOLD_THRESHOLD_RECIP * a < size; |
577 | 0 | } |
578 | | |
579 | | static int |
580 | | font_equiv(fz_context *ctx, fz_font *f, fz_font *g) |
581 | 0 | { |
582 | 0 | unsigned char fdigest[16]; |
583 | 0 | unsigned char gdigest[16]; |
584 | |
|
585 | 0 | if (f == g) |
586 | 0 | return 1; |
587 | | |
588 | 0 | if (strcmp(f->name, g->name) != 0) |
589 | 0 | return 0; |
590 | | |
591 | 0 | if (f->buffer == NULL || g->buffer == NULL) |
592 | 0 | return 0; |
593 | | |
594 | 0 | fz_font_digest(ctx, f, fdigest); |
595 | 0 | fz_font_digest(ctx, g, gdigest); |
596 | |
|
597 | 0 | return (memcmp(fdigest, gdigest, 16) == 0); |
598 | 0 | } |
599 | | |
600 | | static int |
601 | | check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags) |
602 | 0 | { |
603 | 0 | fz_stext_line *line; |
604 | 0 | fz_stext_char *ch; |
605 | |
|
606 | 0 | for (; block != NULL; block = block->next) |
607 | 0 | { |
608 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
609 | 0 | { |
610 | 0 | if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags)) |
611 | 0 | return 1; |
612 | 0 | } |
613 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
614 | 0 | { |
615 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
616 | 0 | { |
617 | 0 | fz_stext_char *pr = NULL; |
618 | 0 | for (ch = line->first_char; ch != NULL; ch = ch->next) |
619 | 0 | { |
620 | | /* Not perfect, but it'll do! */ |
621 | 0 | if (ch->c == c && is_within_fake_bold_distance(ch->origin.x, p.x, size) && is_within_fake_bold_distance(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font)) |
622 | 0 | { |
623 | | /* If we were filled before, and we are stroking now... */ |
624 | 0 | if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED && |
625 | 0 | (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED) |
626 | 0 | { |
627 | | /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */ |
628 | 0 | ch->flags |= flags; |
629 | 0 | return 1; |
630 | 0 | } |
631 | | /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these |
632 | | * as boldening if either the char before, or the char after were also boldened. */ |
633 | 0 | ch->flags |= flags; |
634 | |
|
635 | 0 | if (c == ' ') |
636 | 0 | { |
637 | 0 | if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) || |
638 | 0 | (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0)) |
639 | 0 | { |
640 | | /* OK, we can be bold. */ |
641 | 0 | ch->flags |= FZ_STEXT_BOLD; |
642 | 0 | } |
643 | | /* Whether we have recorded this as being bold or not, still |
644 | | * claim we did, so we swallow the space and don't reemit it. */ |
645 | 0 | return 1; |
646 | 0 | } |
647 | 0 | else |
648 | 0 | { |
649 | 0 | ch->flags |= FZ_STEXT_BOLD; |
650 | 0 | return 1; |
651 | 0 | } |
652 | 0 | } |
653 | 0 | pr = ch; |
654 | 0 | } |
655 | 0 | } |
656 | 0 | } |
657 | 0 | } |
658 | | |
659 | 0 | return 0; |
660 | 0 | } |
661 | | |
/* Return 1 if c is a character that is plausibly being used as a list
 * bullet, else 0. */
static int
plausible_bullet(int c)
{
	switch (c)
	{
	case '*':
	case 0x00B7: /* Middle dot */
	case 0x2022: /* Bullet */
	case 0x2023: /* Triangular bullet */
	case 0x2043: /* Hyphen bullet */
	case 0x204C: /* Black leftwards bullet */
	case 0x204D: /* Black rightwards bullet */
	case 0x2219: /* Bullet operator */
	case 0x25C9: /* Fisheye */
	case 0x25CB: /* White circle */
	case 0x25CF: /* Black circle */
	case 0x25D8: /* Inverse bullet */
	case 0x25E6: /* White bullet */
	case 0x2619: /* Reversed rotated floral heart bullet */
	case 0x261a: /* Black left pointing index */
	case 0x261b: /* Black right pointing index */
	case 0x261c: /* White left pointing index */
	case 0x261d: /* White up pointing index */
	case 0x261e: /* White right pointing index */
	case 0x261f: /* White down pointing index */
	case 0x2765: /* Rotated heavy black heart bullet */
	case 0x2767: /* Rotated floral heart bullet */
	case 0x29BE: /* Circled white bullet */
	case 0x29BF: /* Circled bullet */
	case 0x2660: /* Black spade suit */
	case 0x2661: /* White heart suit */
	case 0x2662: /* White diamond suit */
	case 0x2663: /* Black club suit */
	case 0x2664: /* White spade suit */
	case 0x2665: /* Black heart suit */
	case 0x2666: /* Black diamond suit */
	case 0x2667: /* White club suit */
	case 0x1F446: /* White up pointing backhand index */
	case 0x1F447: /* White down pointing backhand index */
	case 0x1F448: /* White left pointing backhand index */
	case 0x1F449: /* White right pointing backhand index */
	case 0x1f597: /* White down pointing left hand index */
	case 0x1F598: /* Sideways white left pointing index */
	case 0x1F599: /* Sideways white right pointing index */
	case 0x1F59A: /* Sideways black left pointing index */
	case 0x1F59B: /* Sideways black right pointing index */
	case 0x1F59C: /* Black left pointing backhand index */
	case 0x1F59D: /* Black right pointing backhand index */
	case 0x1F59E: /* Sideways white up pointing index */
	case 0x1F59F: /* Sideways white down pointing index */
	case 0x1F5A0: /* Sideways black up pointing index */
	case 0x1F5A1: /* Sideways black down pointing index */
	case 0x1F5A2: /* Black up pointing backhand index */
	case 0x1F5A3: /* Black down pointing backhand index */
	case 0x1FBC1: /* Left third white right pointing index */
	case 0x1FBC2: /* Middle third white right pointing index */
	case 0x1FBC3: /* Right third white right pointing index */
	case 0xFFFD: /* Unicode replacement character */
		return 1;
	default:
		return 0;
	}
}
720 | | |
721 | | static void |
722 | | fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags) |
723 | 0 | { |
724 | 0 | fz_stext_page *page = dev->page; |
725 | 0 | fz_stext_block *cur_block; |
726 | 0 | fz_stext_line *cur_line = NULL; |
727 | |
|
728 | 0 | int new_para = 0; |
729 | 0 | int new_line = 1; |
730 | 0 | int add_space = 0; |
731 | 0 | fz_point dir, ndir, p, q; |
732 | 0 | float size; |
733 | 0 | fz_point delta; |
734 | 0 | float spacing = 0; |
735 | 0 | float base_offset = 0; |
736 | 0 | float dist; |
737 | | |
738 | | /* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */ |
739 | 0 | bidi = bidi & 1; |
740 | | |
741 | | /* dir = direction vector for motion. ndir = normalised(dir) */ |
742 | 0 | if (wmode == 0) |
743 | 0 | { |
744 | 0 | dir.x = 1; |
745 | 0 | dir.y = 0; |
746 | 0 | } |
747 | 0 | else |
748 | 0 | { |
749 | 0 | dir.x = 0; |
750 | 0 | dir.y = -1; |
751 | 0 | } |
752 | 0 | dir = fz_transform_vector(dir, trm); |
753 | 0 | ndir = fz_normalize_vector(dir); |
754 | |
|
755 | 0 | size = fz_matrix_expansion(trm); |
756 | | |
757 | | /* We need to identify where glyphs 'start' (p) and 'stop' (q). |
758 | | * Each glyph holds its 'start' position, and the next glyph in the |
759 | | * span (or span->max if there is no next glyph) holds its 'end' |
760 | | * position. |
761 | | * |
762 | | * For both horizontal and vertical motion, trm->{e,f} gives the |
763 | | * origin (usually the bottom left) of the glyph. |
764 | | * |
765 | | * In horizontal mode: |
766 | | * + p is bottom left. |
767 | | * + q is the bottom right |
768 | | * In vertical mode: |
769 | | * + p is top left (where it advanced from) |
770 | | * + q is bottom left |
771 | | */ |
772 | 0 | if (wmode == 0) |
773 | 0 | { |
774 | 0 | p.x = trm.e; |
775 | 0 | p.y = trm.f; |
776 | 0 | q.x = trm.e + adv * dir.x; |
777 | 0 | q.y = trm.f + adv * dir.y; |
778 | 0 | } |
779 | 0 | else |
780 | 0 | { |
781 | 0 | p.x = trm.e - adv * dir.x; |
782 | 0 | p.y = trm.f - adv * dir.y; |
783 | 0 | q.x = trm.e; |
784 | 0 | q.y = trm.f; |
785 | 0 | } |
786 | | |
787 | | //printf("%g,%g \"%c\" %g,%g\n", p.x, p.y, c, q.x, q.y); |
788 | |
|
789 | 0 | if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0) |
790 | 0 | { |
791 | 0 | if (glyph == -1) |
792 | 0 | { |
793 | 0 | if (dev->last_was_fake_bold) |
794 | 0 | return; |
795 | 0 | } |
796 | 0 | else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags)) |
797 | 0 | { |
798 | 0 | dev->last_was_fake_bold = 1; |
799 | 0 | return; |
800 | 0 | } |
801 | 0 | dev->last_was_fake_bold = 0; |
802 | 0 | } |
803 | | |
804 | | /* Find current position to enter new text. */ |
805 | 0 | cur_block = page->last_struct ? page->last_struct->last_block : page->last_block; |
806 | 0 | if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) |
807 | 0 | cur_block = NULL; |
808 | 0 | cur_line = cur_block ? cur_block->u.t.last_line : NULL; |
809 | | |
810 | | /* We use glyph == -2 to indicate a no-glyph char from an actualtext. The position |
811 | | * is valid though, so we want to advance the pen for these. */ |
812 | 0 | if (cur_line && glyph == -1) |
813 | 0 | { |
814 | | /* Don't advance pen or break lines for no-glyph characters in a cluster */ |
815 | 0 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags); |
816 | 0 | dev->lastbidi = bidi; |
817 | 0 | dev->lastchar = c; |
818 | 0 | dev->lastline = cur_line; |
819 | 0 | return; |
820 | 0 | } |
821 | | |
822 | 0 | if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f) |
823 | 0 | { |
824 | | /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), |
825 | | * then we can't append to the current block/line. */ |
826 | 0 | new_para = 1; |
827 | 0 | new_line = 1; |
828 | 0 | } |
829 | 0 | else |
830 | 0 | { |
831 | | /* Detect fake bold where text is printed twice in the same place. */ |
832 | | /* Largely supplanted by the check_for_fake_bold mechanism above, |
833 | | * but we leave this in for backward compatibility as it's cheap, |
834 | | * and works even when FZ_STEXT_COLLECT_STYLES is not set. */ |
835 | 0 | dist = hypotf(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y) / size; |
836 | | /* This can trigger improperly for glyphs that come from actualtext |
837 | | * as they are frequently overlaid. Therefore rely on glyph >= 0. */ |
838 | 0 | if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar && glyph >= 0) |
839 | 0 | return; |
840 | | |
841 | | /* Calculate how far we've moved since the last character. */ |
842 | 0 | delta.x = p.x - dev->pen.x; |
843 | 0 | delta.y = p.y - dev->pen.y; |
844 | | |
845 | | /* The transform has not changed, so we know we're in the same |
846 | | * direction. Calculate 2 distances; how far off the previous |
847 | | * baseline we are, together with how far along the baseline |
848 | | * we are from the expected position. */ |
849 | 0 | spacing = (ndir.x * delta.x + ndir.y * delta.y) / size; |
850 | 0 | base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size; |
851 | | |
852 | | /* Only a small amount off the baseline - we'll take this */ |
853 | 0 | if (fabsf(base_offset) < BASE_MAX_DIST) |
854 | 0 | { |
855 | | /* If mixed LTR and RTL content */ |
856 | 0 | if ((bidi & 1) != (dev->lastbidi & 1)) |
857 | 0 | { |
858 | | /* Ignore jumps within line when switching between LTR and RTL text. */ |
859 | 0 | new_line = 0; |
860 | 0 | } |
861 | | |
862 | | /* RTL */ |
863 | 0 | else if (bidi & 1) |
864 | 0 | { |
865 | 0 | fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y); |
866 | 0 | float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv; |
867 | | |
868 | | /* If the pen is where we would have been if we |
869 | | * had advanced backwards from the previous |
870 | | * character by this character's advance, we |
871 | | * are probably seeing characters emitted in |
872 | | * logical order. |
873 | | */ |
874 | 0 | if (fabsf(logical_spacing) < SPACE_DIST) |
875 | 0 | { |
876 | 0 | new_line = 0; |
877 | 0 | } |
878 | | |
879 | | /* However, if the pen has advanced to where we would expect it |
880 | | * in an LTR context, we're seeing them emitted in visual order |
881 | | * and should flag them for reordering! |
882 | | */ |
883 | 0 | else if (fabsf(spacing) < SPACE_DIST) |
884 | 0 | { |
885 | 0 | bidi = 3; /* mark line as visual */ |
886 | 0 | new_line = 0; |
887 | 0 | } |
888 | | |
889 | | /* And any other small jump could be a missing space. */ |
890 | 0 | else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST) |
891 | 0 | { |
892 | 0 | if (wmode == 0 && may_add_space(dev->lastchar)) |
893 | 0 | add_space = 1; |
894 | 0 | new_line = 0; |
895 | 0 | } |
896 | 0 | else if (spacing < 0 && spacing > -SPACE_MAX_DIST) |
897 | 0 | { |
898 | | /* Motion is in line, but negative. We've probably got overlapping |
899 | | * chars here. Live with it. */ |
900 | 0 | new_line = 0; |
901 | 0 | } |
902 | 0 | else if (spacing > 0 && spacing < SPACE_MAX_DIST) |
903 | 0 | { |
904 | 0 | bidi = 3; /* mark line as visual */ |
905 | 0 | if (wmode == 0 && may_add_space(dev->lastchar)) |
906 | 0 | add_space = 1 + (spacing > SPACE_DIST*2); |
907 | 0 | new_line = 0; |
908 | 0 | } |
909 | | |
910 | 0 | else |
911 | 0 | { |
912 | | /* Motion is large and unexpected (probably a new table column). */ |
913 | 0 | new_line = 1; |
914 | 0 | } |
915 | 0 | } |
916 | | |
917 | | /* LTR or neutral character */ |
918 | 0 | else |
919 | 0 | { |
920 | 0 | if (fabsf(spacing) < SPACE_DIST) |
921 | 0 | { |
922 | | /* Motion is in line and small enough to ignore. */ |
923 | 0 | new_line = 0; |
924 | 0 | } |
925 | 0 | else if (spacing < 0 && spacing > -SPACE_MAX_DIST) |
926 | 0 | { |
927 | | /* Motion is in line, but negative. We've probably got overlapping |
928 | | * chars here. Live with it. */ |
929 | 0 | new_line = 0; |
930 | 0 | } |
931 | 0 | else if (spacing > 0 && spacing < SPACE_MAX_DIST) |
932 | 0 | { |
933 | | /* Motion is forward in line and large enough to warrant us adding a space. */ |
934 | 0 | if (wmode == 0 && may_add_space(dev->lastchar)) |
935 | 0 | add_space = 1 + (spacing > SPACE_DIST*2); |
936 | 0 | new_line = 0; |
937 | 0 | } |
938 | 0 | else |
939 | 0 | { |
940 | | /* Motion is large and unexpected (probably a new table column). */ |
941 | 0 | new_line = 1; |
942 | 0 | } |
943 | 0 | } |
944 | 0 | } |
945 | | |
946 | | /* Enough for a new line, but not enough for a new paragraph */ |
947 | 0 | else if (fabsf(base_offset) <= PARAGRAPH_DIST) |
948 | 0 | { |
949 | | /* Check indent to spot text-indent style paragraphs */ |
950 | 0 | if (wmode == 0 && cur_line && dev->new_obj) |
951 | 0 | if ((p.x - dev->start.x) > 0.5f && !dev->maybe_bullet) |
952 | 0 | new_para = 1; |
953 | 0 | new_line = 1; |
954 | 0 | } |
955 | | |
956 | | /* Way off the baseline - open a new paragraph */ |
957 | 0 | else |
958 | 0 | { |
959 | 0 | new_para = 1; |
960 | 0 | new_line = 1; |
961 | 0 | } |
962 | 0 | } |
963 | | |
964 | | /* Start a new block (but only at the beginning of a text object) */ |
965 | 0 | if (new_para || !cur_block) |
966 | 0 | { |
967 | 0 | flush_lazy_vectors(ctx, page, dev); |
968 | 0 | cur_block = add_text_block_to_page(ctx, page, dev->id); |
969 | 0 | cur_line = cur_block->u.t.last_line; |
970 | 0 | } |
971 | |
|
972 | 0 | if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(dev->lastchar) && dev->lastline != NULL) |
973 | 0 | dev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED; |
974 | | |
975 | | /* Start a new line */ |
976 | 0 | if (new_line || !cur_line || force_new_line) |
977 | 0 | { |
978 | 0 | cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi); |
979 | 0 | dev->start = p; |
980 | 0 | if (glyph == -2) |
981 | 0 | dev->maybe_bullet = 1; |
982 | 0 | else |
983 | 0 | dev->maybe_bullet = plausible_bullet(c); |
984 | 0 | } |
985 | | |
986 | | /* Henceforth treat such non-glyphs in the usual way. */ |
987 | 0 | if (glyph == -2) |
988 | 0 | glyph = -1; |
989 | | |
990 | | /* Add synthetic space */ |
991 | 0 | if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES)) |
992 | 0 | add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, add_space, flags, dev->flags); |
993 | |
|
994 | 0 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags); |
995 | |
|
996 | 0 | dev->lastchar = c; |
997 | 0 | dev->lastbidi = bidi; |
998 | 0 | dev->lastline = cur_line; |
999 | 0 | dev->lag_pen = p; |
1000 | 0 | dev->pen = q; |
1001 | |
|
1002 | 0 | dev->new_obj = 0; |
1003 | 0 | dev->trm = trm; |
1004 | 0 | } |
1005 | | |
1006 | | static void |
1007 | | fz_add_stext_char(fz_context *ctx, |
1008 | | fz_stext_device *dev, |
1009 | | fz_font *font, |
1010 | | int c, |
1011 | | int glyph, |
1012 | | fz_matrix trm, |
1013 | | float adv, |
1014 | | int wmode, |
1015 | | int bidi, |
1016 | | int force_new_line, |
1017 | | int flags) |
1018 | 0 | { |
1019 | | /* ignore when one unicode character maps to multiple glyphs */ |
1020 | 0 | if (c == -1) |
1021 | 0 | return; |
1022 | | |
1023 | 0 | if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS) |
1024 | 0 | fz_calculate_font_ascender_descender(ctx, font); |
1025 | |
|
1026 | 0 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) |
1027 | 0 | { |
1028 | 0 | switch (c) |
1029 | 0 | { |
1030 | 0 | case 0xFB00: /* ff */ |
1031 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1032 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); |
1033 | 0 | return; |
1034 | 0 | case 0xFB01: /* fi */ |
1035 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1036 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags); |
1037 | 0 | return; |
1038 | 0 | case 0xFB02: /* fl */ |
1039 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1040 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags); |
1041 | 0 | return; |
1042 | 0 | case 0xFB03: /* ffi */ |
1043 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1044 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); |
1045 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags); |
1046 | 0 | return; |
1047 | 0 | case 0xFB04: /* ffl */ |
1048 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1049 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); |
1050 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags); |
1051 | 0 | return; |
1052 | 0 | case 0xFB05: /* long st */ |
1053 | 0 | case 0xFB06: /* st */ |
1054 | 0 | fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1055 | 0 | fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags); |
1056 | 0 | return; |
1057 | 0 | } |
1058 | 0 | } |
1059 | | |
1060 | 0 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) |
1061 | 0 | { |
1062 | 0 | switch (c) |
1063 | 0 | { |
1064 | 0 | case 0x0009: /* tab */ |
1065 | 0 | case 0x0020: /* space */ |
1066 | 0 | case 0x00A0: /* no-break space */ |
1067 | 0 | case 0x1680: /* ogham space mark */ |
1068 | 0 | case 0x180E: /* mongolian vowel separator */ |
1069 | 0 | case 0x2000: /* en quad */ |
1070 | 0 | case 0x2001: /* em quad */ |
1071 | 0 | case 0x2002: /* en space */ |
1072 | 0 | case 0x2003: /* em space */ |
1073 | 0 | case 0x2004: /* three-per-em space */ |
1074 | 0 | case 0x2005: /* four-per-em space */ |
1075 | 0 | case 0x2006: /* six-per-em space */ |
1076 | 0 | case 0x2007: /* figure space */ |
1077 | 0 | case 0x2008: /* punctuation space */ |
1078 | 0 | case 0x2009: /* thin space */ |
1079 | 0 | case 0x200A: /* hair space */ |
1080 | 0 | case 0x202F: /* narrow no-break space */ |
1081 | 0 | case 0x205F: /* medium mathematical space */ |
1082 | 0 | case 0x3000: /* ideographic space */ |
1083 | 0 | c = ' '; |
1084 | 0 | } |
1085 | 0 | } |
1086 | |
|
1087 | 0 | fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1088 | 0 | } |
1089 | | |
1090 | | static fz_rect |
1091 | | current_clip(fz_context *ctx, fz_stext_device *dev) |
1092 | 0 | { |
1093 | 0 | fz_rect r = fz_infinite_rect; |
1094 | |
|
1095 | 0 | if (dev->flags & FZ_STEXT_CLIP) |
1096 | 0 | { |
1097 | 0 | r = fz_device_current_scissor(ctx, &dev->super); |
1098 | 0 | r = fz_intersect_rect(r, dev->page->mediabox); |
1099 | 0 | } |
1100 | 0 | if (dev->flags & FZ_STEXT_CLIP_RECT) |
1101 | 0 | r = fz_intersect_rect(r, dev->opts.clip); |
1102 | |
|
1103 | 0 | return r; |
1104 | 0 | } |
1105 | | |
1106 | | static void |
1107 | | do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags) |
1108 | 0 | { |
1109 | 0 | fz_font *font = span->font; |
1110 | 0 | fz_matrix tm = span->trm; |
1111 | 0 | float adv; |
1112 | 0 | int unicode; |
1113 | 0 | int i; |
1114 | |
|
1115 | 0 | for (i = start; i < end; i++) |
1116 | 0 | { |
1117 | 0 | if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) |
1118 | 0 | { |
1119 | 0 | fz_rect r = current_clip(ctx, dev); |
1120 | 0 | if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r)) |
1121 | 0 | { |
1122 | 0 | dev->last.clipped = 1; |
1123 | 0 | continue; |
1124 | 0 | } |
1125 | 0 | } |
1126 | 0 | dev->last.clipped = 0; |
1127 | | |
1128 | | /* Calculate new pen location and delta */ |
1129 | 0 | tm.e = span->items[i].x; |
1130 | 0 | tm.f = span->items[i].y; |
1131 | 0 | dev->last.trm = fz_concat(tm, ctm); |
1132 | 0 | dev->last.bidi_level = span->bidi_level; |
1133 | 0 | dev->last.wmode = span->wmode; |
1134 | 0 | if (font != dev->last.font) |
1135 | 0 | { |
1136 | 0 | fz_drop_font(ctx, dev->last.font); |
1137 | 0 | dev->last.font = fz_keep_font(ctx, font); |
1138 | 0 | } |
1139 | 0 | dev->last.valid = 1; |
1140 | 0 | dev->last.flags = flags; |
1141 | | |
1142 | | /* Calculate bounding box and new pen position based on font metrics */ |
1143 | 0 | if (span->items[i].gid >= 0) |
1144 | 0 | adv = span->items[i].adv; |
1145 | 0 | else |
1146 | 0 | adv = 0; |
1147 | |
|
1148 | 0 | unicode = span->items[i].ucs; |
1149 | 0 | if (unicode == FZ_REPLACEMENT_CHARACTER) |
1150 | 0 | { |
1151 | 0 | if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE) |
1152 | 0 | { |
1153 | 0 | unicode = span->items[i].cid; |
1154 | 0 | flags |= FZ_STEXT_UNICODE_IS_CID; |
1155 | 0 | } |
1156 | 0 | else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE) |
1157 | 0 | { |
1158 | 0 | unicode = span->items[i].gid; |
1159 | 0 | flags |= FZ_STEXT_UNICODE_IS_GID; |
1160 | 0 | } |
1161 | 0 | } |
1162 | | |
1163 | | /* Send the chars we have through. */ |
1164 | 0 | fz_add_stext_char(ctx, dev, font, |
1165 | 0 | unicode, |
1166 | 0 | span->items[i].gid, |
1167 | 0 | dev->last.trm, |
1168 | 0 | adv, |
1169 | 0 | dev->last.wmode, |
1170 | 0 | dev->last.bidi_level, |
1171 | 0 | (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), |
1172 | 0 | flags); |
1173 | 0 | } |
1174 | 0 | } |
1175 | | |
/*
 * Return the rune at position idx (counted in runes, from zero) of the
 * UTF-8 string utf8, or -1 if the terminating zero rune is reached at or
 * before that position.
 */
static int
rune_index(const char *utf8, size_t idx)
{
	for (;;)
	{
		int rune;
		int len = fz_chartorune(&rune, utf8);

		/* Ran off the end of the string before reaching idx. */
		if (rune == 0)
			return -1;
		if (idx == 0)
			return rune;

		utf8 += len;
		idx--;
	}
}
1192 | | |
1193 | | static void |
1194 | | flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i, int end, float adv) |
1195 | 0 | { |
1196 | 0 | if (*actualtext == 0) |
1197 | 0 | return; |
1198 | | |
1199 | 0 | if (!dev->last.valid) |
1200 | 0 | return; |
1201 | | |
1202 | 0 | if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) |
1203 | 0 | if (dev->last.clipped) |
1204 | 0 | return; |
1205 | | |
1206 | 0 | if (adv != 0) |
1207 | 0 | { |
1208 | 0 | const char *at = actualtext; |
1209 | 0 | int j = i; |
1210 | |
|
1211 | 0 | while (end < 0 || (end >= 0 && i < end)) |
1212 | 0 | { |
1213 | 0 | int rune; |
1214 | 0 | at += fz_chartorune(&rune, at); |
1215 | |
|
1216 | 0 | if (rune == 0) |
1217 | 0 | break; |
1218 | 0 | j++; |
1219 | 0 | } |
1220 | |
|
1221 | 0 | if (j != i) |
1222 | 0 | adv /= (j - i); |
1223 | 0 | } |
1224 | |
|
1225 | 0 | while (end < 0 || (end >= 0 && i < end)) |
1226 | 0 | { |
1227 | 0 | int rune; |
1228 | 0 | actualtext += fz_chartorune(&rune, actualtext); |
1229 | |
|
1230 | 0 | if (rune == 0) |
1231 | 0 | break; |
1232 | | |
1233 | 0 | dev->last.trm.e = dev->pen.x; |
1234 | 0 | dev->last.trm.f = dev->pen.y; |
1235 | |
|
1236 | 0 | fz_add_stext_char(ctx, dev, dev->last.font, |
1237 | 0 | rune, |
1238 | 0 | -2, |
1239 | 0 | dev->last.trm, |
1240 | 0 | adv, |
1241 | 0 | dev->last.wmode, |
1242 | 0 | dev->last.bidi_level, |
1243 | 0 | (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), |
1244 | 0 | dev->last.flags); |
1245 | 0 | i++; |
1246 | 0 | } |
1247 | 0 | } |
1248 | | |
/*
 * Extract a span while an ActualText entry (mt) is in force, substituting
 * the ActualText runes for the span's own unicode values.
 *
 * Outline:
 *  1. Items at the start of the span whose ucs equals the leading runes
 *     of mt->text are sent unchanged through do_extract.
 *  2. If that prefix covers the whole span, the unconsumed text is moved
 *     to the front of mt->text and kept for a later span.
 *  3. Items at the end of the span that match the trailing runes are
 *     set aside as a postfix.
 *  4. The items in between are sent with runes taken from the text (or
 *     with rune -1, which fz_add_stext_char ignores).
 *  5. If no postfix was found, the unconsumed text is kept in mt->text
 *     for a later span. Otherwise the remaining text is flushed, the
 *     postfix is sent through do_extract and mt->text is emptied.
 *
 * mt->text is modified in place.
 */
static void
do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
{
	/* We are within an actualtext block. This means we can't just add the chars
	 * as they are. We need to add the chars as they are meant to be. Sadly the
	 * actualtext mechanism doesn't help us at all with positioning. */
	fz_font *font = span->font;
	fz_matrix tm = span->trm;
	float adv;
	int start, i, end;
	char *actualtext = mt->text;
	/* z counts the runes of the actualtext that are still unmatched. */
	size_t z = fz_utflen(actualtext);

	/* If actualtext is empty, nothing to do! */
	if (z == 0)
		return;

	/* Now, we HOPE that the creator of a PDF will minimise the actual text
	 * differences, so that we'll get:
	 *   "Politicians <Actualtext="lie">fib</ActualText>, always."
	 * rather than:
	 *   "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
	 * but experience with PDF files tells us that this won't always be the case.
	 *
	 * We try to minimise the actualtext section here, just in case.
	 */

	/* Spot a matching prefix and send it. */
	for (start = 0; start < span->len; start++)
	{
		int rune;
		int len = fz_chartorune(&rune, actualtext);
		if (span->items[start].ucs != rune || rune == 0)
			break;
		actualtext += len; z--;
	}
	if (start != 0)
		do_extract(ctx, dev, span, ctm, 0, start, flags);

	if (start == span->len)
	{
		/* The prefix has consumed all this object. Just shorten the actualtext and we'll
		 * catch the rest next time. */
		/* From here on z is reused as a byte count (including the terminator). */
		z = strlen(actualtext)+1;
		memmove(mt->text, actualtext, z);
		return;
	}

	/* We haven't consumed the whole string, so there must be runes left.
	 * Shut coverity up. */
	assert(z != 0);

	/* Spot a matching postfix. Can't send it til the end. */
	for (end = span->len; end > start; end--)
	{
		/* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
		/* If z has reached 0, z-1 wraps around; rune_index then runs off the
		 * end of the string and returns -1. */
		int rune = rune_index(actualtext, z-1);
		if (span->items[end-1].ucs != rune)
			break;
		z--;
	}
	/* So we can send end -> span->len at the end. */

	/* So we have at least SOME chars that don't match. */
	/* Now, do the difficult bit in the middle.*/
	/* items[start..end] have to be sent with actualtext[start..z] */
	for (i = start; i < end; i++)
	{
		fz_text_item *item = &span->items[i];
		/* -1 means "no rune for this item"; fz_add_stext_char drops it. */
		int rune = -1;

		if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
		{
			fz_rect r = current_clip(ctx, dev);
			if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
			{
				/* A clipped item is skipped without consuming a rune. */
				dev->last.clipped = 1;
				continue;
			}
		}
		dev->last.clipped = 0;

		/* NOTE(review): i is an index into the span, while z is the number of
		 * runes left after the prefix and postfix were removed. Comparing
		 * (i - start) against z may have been intended - confirm. */
		if ((size_t)i < z)
			actualtext += fz_chartorune(&rune, actualtext);

		/* Calculate new pen location and delta */
		tm.e = item->x;
		tm.f = item->y;
		dev->last.trm = fz_concat(tm, ctm);
		dev->last.bidi_level = span->bidi_level;
		dev->last.wmode = span->wmode;
		if (font != dev->last.font)
		{
			fz_drop_font(ctx, dev->last.font);
			dev->last.font = fz_keep_font(ctx, font);
		}
		dev->last.valid = 1;
		dev->last.flags = flags;

		/* Calculate bounding box and new pen position based on font metrics */
		if (item->gid >= 0)
			adv = item->adv;
		else
			adv = 0;

		fz_add_stext_char(ctx, dev, font,
			rune,
			span->items[i].gid,
			dev->last.trm,
			adv,
			dev->last.wmode,
			dev->last.bidi_level,
			(i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
			flags);
	}

	/* If we haven't spotted a postfix by this point, then don't force ourselves to output
	 * any more of the actualtext at this point. We might get a new text object that matches
	 * more of it. */
	if (end == span->len)
	{
		/* Shorten actualtext and exit. */
		z = strlen(actualtext)+1;
		memmove(mt->text, actualtext, z);
		return;
	}

	/* if this is the first text on the page, and the actual text suffix matches the entire
	 * span text, then no font will have been set above, so set the last used font to the
	 * span font since flush_actualtext() assumes that a font has been set.
	 */
	if (!dev->last.font)
		dev->last.font = fz_keep_font(ctx, font);

	/* We found a matching postfix. It seems likely that this is going to be the only
	 * text object we get, so send any remaining actualtext now. */
	/* NOTE(review): strlen() yields bytes, whereas flush_actualtext counts runes
	 * against its 'end' bound; the two differ for non-ASCII text - confirm. */
	flush_actualtext(ctx, dev, actualtext, i, i + (int)strlen(actualtext) - (span->len - end), 0);

	/* Send the postfix */
	if (end != span->len)
		do_extract(ctx, dev, span, ctm, end, span->len, flags);

	/* Everything has been sent; leave an empty actualtext behind. */
	mt->text[0] = 0;
}
1393 | | |
1394 | | static void |
1395 | | fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags) |
1396 | 0 | { |
1397 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1398 | 0 | metatext_t *mt = NULL; |
1399 | |
|
1400 | 0 | if (span->len == 0) |
1401 | 0 | return; |
1402 | | |
1403 | | /* Are we in an actualtext? */ |
1404 | 0 | if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT)) |
1405 | 0 | mt = find_actualtext(dev); |
1406 | |
|
1407 | 0 | if (mt) |
1408 | 0 | do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags); |
1409 | 0 | else |
1410 | 0 | do_extract(ctx, dev, span, ctm, 0, span->len, flags); |
1411 | 0 | } |
1412 | | |
1413 | | static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha) |
1414 | 0 | { |
1415 | 0 | float rgb[3]; |
1416 | 0 | fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params); |
1417 | 0 | return |
1418 | 0 | (((uint32_t) fz_clampi(alpha * 255 + 0.5f, 0, 255)) << 24) | |
1419 | 0 | (((uint32_t) fz_clampi(rgb[0] * 255 + 0.5f, 0, 255)) << 16) | |
1420 | 0 | (((uint32_t) fz_clampi(rgb[1] * 255 + 0.5f, 0, 255)) << 8) | |
1421 | 0 | (((uint32_t) fz_clampi(rgb[2] * 255 + 0.5f, 0, 255))); |
1422 | 0 | } |
1423 | | |
1424 | | static void |
1425 | | fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, |
1426 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
1427 | 0 | { |
1428 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1429 | 0 | fz_text_span *span; |
1430 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1431 | 0 | return; |
1432 | 0 | tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha); |
1433 | 0 | tdev->new_obj = 1; |
1434 | 0 | for (span = text->head; span; span = span->next) |
1435 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED); |
1436 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1437 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1438 | 0 | } |
1439 | | |
1440 | | static void |
1441 | | fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, |
1442 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
1443 | 0 | { |
1444 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1445 | 0 | fz_text_span *span; |
1446 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1447 | 0 | return; |
1448 | 0 | tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha); |
1449 | 0 | tdev->new_obj = 1; |
1450 | 0 | for (span = text->head; span; span = span->next) |
1451 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED); |
1452 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1453 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1454 | 0 | } |
1455 | | |
1456 | | static void |
1457 | | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) |
1458 | 0 | { |
1459 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1460 | 0 | fz_text_span *span; |
1461 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1462 | 0 | return; |
1463 | 0 | tdev->color = 0; |
1464 | 0 | tdev->new_obj = 1; |
1465 | 0 | for (span = text->head; span; span = span->next) |
1466 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED); |
1467 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1468 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1469 | 0 | } |
1470 | | |
1471 | | static void |
1472 | | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) |
1473 | 0 | { |
1474 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1475 | 0 | fz_text_span *span; |
1476 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1477 | 0 | return; |
1478 | 0 | tdev->color = 0; |
1479 | 0 | tdev->new_obj = 1; |
1480 | 0 | for (span = text->head; span; span = span->next) |
1481 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED); |
1482 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1483 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1484 | 0 | } |
1485 | | |
1486 | | static void |
1487 | | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) |
1488 | 0 | { |
1489 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1490 | 0 | fz_text_span *span; |
1491 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1492 | 0 | return; |
1493 | 0 | tdev->color = 0; |
1494 | 0 | tdev->new_obj = 1; |
1495 | 0 | for (span = text->head; span; span = span->next) |
1496 | 0 | fz_stext_extract(ctx, tdev, span, ctm, 0); |
1497 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1498 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1499 | 0 | } |
1500 | | |
1501 | | static void |
1502 | | fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text) |
1503 | 0 | { |
1504 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1505 | 0 | metatext_t *mt = find_actualtext(tdev); |
1506 | 0 | char *new_text = NULL; |
1507 | |
|
1508 | 0 | if (mt != NULL && meta == FZ_METATEXT_ACTUALTEXT) |
1509 | 0 | flush_actualtext(ctx, tdev, mt->text, 0, -1, 0); |
1510 | |
|
1511 | 0 | if (meta == FZ_METATEXT_ACTUALTEXT) |
1512 | 0 | tdev->last.valid = 0; |
1513 | |
|
1514 | 0 | new_text = text ? fz_strdup(ctx, text) : NULL; |
1515 | |
|
1516 | 0 | fz_try(ctx) |
1517 | 0 | { |
1518 | 0 | mt = fz_malloc_struct(ctx, metatext_t); |
1519 | |
|
1520 | 0 | mt->prev = tdev->metatext; |
1521 | 0 | tdev->metatext = mt; |
1522 | 0 | mt->type = meta; |
1523 | 0 | mt->text = new_text; |
1524 | 0 | mt->bounds = fz_empty_rect; |
1525 | 0 | } |
1526 | 0 | fz_catch(ctx) |
1527 | 0 | { |
1528 | 0 | fz_free(ctx, new_text); |
1529 | 0 | fz_rethrow(ctx); |
1530 | 0 | } |
1531 | 0 | } |
1532 | | |
1533 | | static void |
1534 | | pop_metatext(fz_context *ctx, fz_stext_device *dev) |
1535 | 0 | { |
1536 | 0 | metatext_t *prev; |
1537 | 0 | fz_rect bounds; |
1538 | |
|
1539 | 0 | if (!dev->metatext) |
1540 | 0 | return; |
1541 | | |
1542 | 0 | prev = dev->metatext->prev; |
1543 | 0 | bounds = dev->metatext->bounds; |
1544 | 0 | fz_free(ctx, dev->metatext->text); |
1545 | 0 | fz_free(ctx, dev->metatext); |
1546 | 0 | dev->metatext = prev; |
1547 | 0 | if (prev) |
1548 | 0 | prev->bounds = fz_union_rect(prev->bounds, bounds); |
1549 | 0 | } |
1550 | | |
1551 | | static void |
1552 | | fz_stext_end_metatext(fz_context *ctx, fz_device *dev) |
1553 | 0 | { |
1554 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1555 | 0 | fz_font *myfont = NULL; |
1556 | |
|
1557 | 0 | if (!tdev->metatext) |
1558 | 0 | return; /* Mismatched pop. Live with it. */ |
1559 | | |
1560 | 0 | if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT || (tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0) |
1561 | 0 | { |
1562 | | /* We only deal with ActualText here. Just pop anything else off, |
1563 | | * and we're done. */ |
1564 | 0 | pop_metatext(ctx, tdev); |
1565 | 0 | return; |
1566 | 0 | } |
1567 | | |
1568 | | /* If we have a 'last' text position, send the content after that. */ |
1569 | 0 | if (tdev->last.valid) |
1570 | 0 | { |
1571 | 0 | flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1, 0); |
1572 | 0 | pop_metatext(ctx, tdev); |
1573 | 0 | tdev->last.valid = 0; |
1574 | 0 | return; |
1575 | 0 | } |
1576 | | |
1577 | | /* Unless we have collected a rectangle for content that encloses the actual text, |
1578 | | * we can't do anything. */ |
1579 | 0 | if (fz_is_empty_rect(tdev->metatext->bounds)) |
1580 | 0 | { |
1581 | 0 | if ((dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) == 0 && tdev->metatext->text[0]) |
1582 | 0 | fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned."); |
1583 | 0 | pop_metatext(ctx, tdev); |
1584 | 0 | return; |
1585 | 0 | } |
1586 | | |
1587 | | /* We have a rectangle, so send the text to fill that. */ |
1588 | 0 | tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0; |
1589 | 0 | tdev->last.trm.b = 0; |
1590 | 0 | tdev->last.trm.c = 0; |
1591 | 0 | tdev->last.trm.d = tdev->metatext->bounds.y0 - tdev->metatext->bounds.y1; |
1592 | 0 | tdev->last.trm.e = tdev->metatext->bounds.x0; |
1593 | 0 | tdev->last.trm.f = tdev->metatext->bounds.y1; |
1594 | 0 | tdev->last.valid = 1; |
1595 | |
|
1596 | 0 | fz_var(myfont); |
1597 | |
|
1598 | 0 | fz_try(ctx) |
1599 | 0 | { |
1600 | 0 | if (tdev->last.font == NULL) |
1601 | 0 | { |
1602 | 0 | myfont = fz_new_base14_font(ctx, "Helvetica"); |
1603 | 0 | tdev->last.font = myfont; |
1604 | 0 | } |
1605 | 0 | flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1, 1); |
1606 | 0 | pop_metatext(ctx, tdev); |
1607 | 0 | } |
1608 | 0 | fz_always(ctx) |
1609 | 0 | { |
1610 | 0 | if (myfont) |
1611 | 0 | { |
1612 | 0 | tdev->last.font = NULL; |
1613 | 0 | fz_drop_font(ctx, myfont); |
1614 | 0 | } |
1615 | 0 | } |
1616 | 0 | fz_catch(ctx) |
1617 | 0 | fz_rethrow(ctx); |
1618 | 0 | } |
1619 | | |
1620 | | |
1621 | | /* Images and shadings */ |
1622 | | |
1623 | | static void |
1624 | | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) |
1625 | 0 | { |
1626 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1627 | 0 | fz_rect *bounds = actualtext_bounds(tdev); |
1628 | | |
1629 | | /* If there is an actualtext in force, update its bounds. */ |
1630 | 0 | if (bounds) |
1631 | 0 | { |
1632 | 0 | static const fz_rect unit = { 0, 0, 1, 1 }; |
1633 | 0 | *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm)); |
1634 | 0 | } |
1635 | | |
1636 | | /* Unless we are being told to preserve images, nothing to do here. */ |
1637 | 0 | if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0) |
1638 | 0 | return; |
1639 | | |
1640 | | /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ |
1641 | 0 | if (alpha >= 0.5f) |
1642 | 0 | { |
1643 | 0 | fz_stext_block *block; |
1644 | 0 | flush_lazy_vectors(ctx, tdev->page, tdev); |
1645 | 0 | block = add_image_block_to_page(ctx, tdev->page, ctm, img, tdev->id); |
1646 | 0 | if (tdev->opts.flags & FZ_STEXT_CLIP) |
1647 | 0 | { |
1648 | 0 | fz_rect clip = fz_device_current_scissor(ctx, dev); |
1649 | 0 | clip = fz_intersect_rect(clip, tdev->page->mediabox); |
1650 | 0 | block->bbox = fz_intersect_rect(block->bbox, clip); |
1651 | 0 | } |
1652 | 0 | } |
1653 | 0 | } |
1654 | | |
/*
 * Device callback for image masks. A mask is handled exactly like an
 * ordinary image: the call is forwarded to fz_stext_fill_image, and the
 * colourspace and colour arguments are not used.
 */
static void
fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
	fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
{
	fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
}
1661 | | |
1662 | | static fz_image * |
1663 | | fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor) |
1664 | 0 | { |
1665 | 0 | fz_matrix ctm = *in_out_ctm; |
1666 | 0 | fz_pixmap *pix; |
1667 | 0 | fz_image *img = NULL; |
1668 | 0 | fz_rect bounds; |
1669 | 0 | fz_irect bbox; |
1670 | |
|
1671 | 0 | bounds = fz_bound_shade(ctx, shade, ctm); |
1672 | 0 | bounds = fz_intersect_rect(bounds, scissor); |
1673 | 0 | bbox = fz_irect_from_rect(bounds); |
1674 | |
|
1675 | 0 | pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); |
1676 | 0 | fz_try(ctx) |
1677 | 0 | { |
1678 | 0 | if (shade->use_background) |
1679 | 0 | fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); |
1680 | 0 | else |
1681 | 0 | fz_clear_pixmap(ctx, pix); |
1682 | 0 | fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL); |
1683 | 0 | img = fz_new_image_from_pixmap(ctx, pix, NULL); |
1684 | 0 | } |
1685 | 0 | fz_always(ctx) |
1686 | 0 | fz_drop_pixmap(ctx, pix); |
1687 | 0 | fz_catch(ctx) |
1688 | 0 | fz_rethrow(ctx); |
1689 | | |
1690 | 0 | in_out_ctm->a = pix->w; |
1691 | 0 | in_out_ctm->b = 0; |
1692 | 0 | in_out_ctm->c = 0; |
1693 | 0 | in_out_ctm->d = pix->h; |
1694 | 0 | in_out_ctm->e = pix->x; |
1695 | 0 | in_out_ctm->f = pix->y; |
1696 | 0 | return img; |
1697 | 0 | } |
1698 | | |
1699 | | static void |
1700 | | fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params) |
1701 | 0 | { |
1702 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1703 | 0 | fz_matrix local_ctm; |
1704 | 0 | fz_rect scissor; |
1705 | 0 | fz_image *image; |
1706 | | |
1707 | | /* If we aren't preserving images, don't waste time making the shade. */ |
1708 | 0 | if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0) |
1709 | 0 | { |
1710 | | /* But we do still need to handle actualtext bounds. */ |
1711 | 0 | fz_rect *bounds = actualtext_bounds(tdev); |
1712 | 0 | if (bounds) |
1713 | 0 | *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm)); |
1714 | 0 | return; |
1715 | 0 | } |
1716 | | |
1717 | 0 | local_ctm = ctm; |
1718 | 0 | scissor = fz_device_current_scissor(ctx, dev); |
1719 | 0 | if (dev->flags & FZ_STEXT_CLIP_RECT) |
1720 | 0 | scissor = fz_intersect_rect(scissor, tdev->opts.clip); |
1721 | 0 | scissor = fz_intersect_rect(scissor, tdev->page->mediabox); |
1722 | 0 | image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); |
1723 | 0 | fz_try(ctx) |
1724 | 0 | fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); |
1725 | 0 | fz_always(ctx) |
1726 | 0 | fz_drop_image(ctx, image); |
1727 | 0 | fz_catch(ctx) |
1728 | 0 | fz_rethrow(ctx); |
1729 | 0 | } |
1730 | | |
1731 | | static void |
1732 | | fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block) |
1733 | 0 | { |
1734 | 0 | fz_stext_line *line; |
1735 | 0 | fz_stext_char *ch; |
1736 | |
|
1737 | 0 | for ( ; block != NULL; block = block->next) |
1738 | 0 | { |
1739 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
1740 | 0 | if (block->u.s.down) |
1741 | 0 | fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block); |
1742 | 0 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
1743 | 0 | continue; |
1744 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
1745 | 0 | { |
1746 | 0 | int reorder = 0; |
1747 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
1748 | 0 | { |
1749 | 0 | fz_rect ch_box = fz_rect_from_quad(ch->quad); |
1750 | 0 | if (ch == line->first_char) |
1751 | 0 | line->bbox = ch_box; |
1752 | 0 | else |
1753 | 0 | line->bbox = fz_union_rect(line->bbox, ch_box); |
1754 | 0 | if (ch->bidi == 3) |
1755 | 0 | reorder = 1; |
1756 | 0 | } |
1757 | 0 | block->bbox = fz_union_rect(block->bbox, line->bbox); |
1758 | 0 | if (reorder) |
1759 | 0 | reverse_bidi_line(line); |
1760 | 0 | } |
1761 | 0 | } |
1762 | 0 | } |
1763 | | |
1764 | | static void |
1765 | | advance_to_x(fz_point *a, fz_point b, float x) |
1766 | 0 | { |
1767 | 0 | a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x); |
1768 | 0 | a->x = x; |
1769 | 0 | } |
1770 | | |
1771 | | static void |
1772 | | advance_to_y(fz_point *a, fz_point b, float y) |
1773 | 0 | { |
1774 | 0 | a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y); |
1775 | 0 | a->y = y; |
1776 | 0 | } |
1777 | | |
1778 | | static int |
1779 | | line_crosses_rect(fz_point a, fz_point b, fz_rect r) |
1780 | 0 | { |
1781 | | /* Cope with trivial exclusions */ |
1782 | 0 | if (a.x < r.x0 && b.x < r.x0) |
1783 | 0 | return 0; |
1784 | 0 | if (a.x > r.x1 && b.x > r.x1) |
1785 | 0 | return 0; |
1786 | 0 | if (a.y < r.y0 && b.y < r.y0) |
1787 | 0 | return 0; |
1788 | 0 | if (a.y > r.y1 && b.y > r.y1) |
1789 | 0 | return 0; |
1790 | | |
1791 | 0 | if (a.x < r.x0) |
1792 | 0 | advance_to_x(&a, b, r.x0); |
1793 | 0 | if (a.x > r.x1) |
1794 | 0 | advance_to_x(&a, b, r.x1); |
1795 | 0 | if (a.y < r.y0) |
1796 | 0 | advance_to_y(&a, b, r.y0); |
1797 | 0 | if (a.y > r.y1) |
1798 | 0 | advance_to_y(&a, b, r.y1); |
1799 | |
|
1800 | 0 | return fz_is_point_inside_rect(a, r); |
1801 | 0 | } |
1802 | | |
1803 | | static float |
1804 | | calculate_ascent(fz_point p, fz_point origin, fz_point dir) |
1805 | 0 | { |
1806 | 0 | return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x); |
1807 | 0 | } |
1808 | | |
1809 | | /* Create us a rect from the given quad, but extend it downwards |
1810 | | * to allow for underlines that pass under the glyphs. */ |
1811 | | static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size) |
1812 | 0 | { |
1813 | | /* Consider the two rects from A and g respectively. |
1814 | | * |
1815 | | * ul +------+ ur or |
1816 | | * | /\ | ul +------+ ur |
1817 | | * | /__\ | | /''\ | |
1818 | | * |/ \| |( || |
1819 | | * ll +------+ lr | ''''|| |
1820 | | * | ''' | <-expected underline level |
1821 | | * ll +------+ lr |
1822 | | * |
1823 | | * So an underline won't cross A's rect, but will cross g's. |
1824 | | * We want to make a rect that includes a suitable amount of |
1825 | | * space underneath. The information we have available to us |
1826 | | * is summed up here: |
1827 | | * |
1828 | | * ul +---------+ ur |
1829 | | * | | |
1830 | | * | origin | |
1831 | | * |+----------> dir |
1832 | | * | | |
1833 | | * ll +---------+ lr |
1834 | | * |
1835 | | * Consider the distance from ul to the line that passes through |
1836 | | * the origin with direction dir. Similarly, consider the distance |
1837 | | * from ur to the same line. This can be thought of as the 'ascent' |
1838 | | * of this character. |
1839 | | * |
1840 | | * We'd like the distance from ul to ll to be greater than this, so |
1841 | | * as to ensure we cover the possible location where an underline |
1842 | | * might reasonably go. |
1843 | | * |
1844 | | * If we have a line (l) through point A with direction vector u, |
1845 | | * the distance between point P and line(l) is: |
1846 | | * |
1847 | | * d(P,l) = || AP x u || / || u || |
1848 | | * |
1849 | | * where x is the cross product. |
1850 | | * |
1851 | | * For us, because || dir || = 1: |
1852 | | * |
1853 | | * d(ul, origin) = || (origin-ul) x dir || |
1854 | | * |
1855 | | * The cross product is only defined in 3 (or 7!) dimensions, so |
1856 | | * extend both vectors into 3d by defining a 0 z component. |
1857 | | * |
1858 | | * (origin-ul) x dir = [ (origin.y - ul.y) . 0 - 0 . dir.y ] |
1859 | | * [ 0 . dir.x - (origin.x - ul.y) . 0 ] |
1860 | | * [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ] |
1861 | | * |
1862 | | * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x |
1863 | | */ |
1864 | 0 | float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2; |
1865 | 0 | fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y }; |
1866 | 0 | fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y }; |
1867 | 0 | float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2; |
1868 | 0 | int neg = 0; |
1869 | 0 | float extra_rise = 0; |
1870 | | |
1871 | | /* Spaces will have 0 ascent. underscores will have small ascent. |
1872 | | * We want a sane ascent to be able to spot strikeouts, but not |
1873 | | * so big that it incorporates lines above the text, like borders. */ |
1874 | 0 | if (ascent < 0.75*size) |
1875 | 0 | extra_rise = 0.75*size - ascent; |
1876 | | |
1877 | | /* We'd like height to be at least ascent + 1/4 size */ |
1878 | 0 | if (height < 0) |
1879 | 0 | neg = 1, height = -height; |
1880 | 0 | if (height < ascent + size * 0.25f) |
1881 | 0 | height = ascent + size * 0.25f; |
1882 | |
|
1883 | 0 | height -= ascent; |
1884 | 0 | if (neg) |
1885 | 0 | height = -height; |
1886 | 0 | quad.ll.x += - height * dir.y; |
1887 | 0 | quad.ll.y += height * dir.x; |
1888 | 0 | quad.lr.x += - height * dir.y; |
1889 | 0 | quad.lr.y += height * dir.x; |
1890 | 0 | quad.ul.x -= - extra_rise * dir.y; |
1891 | 0 | quad.ul.y -= extra_rise * dir.x; |
1892 | 0 | quad.ur.x -= - extra_rise * dir.y; |
1893 | 0 | quad.ur.y -= extra_rise * dir.x; |
1894 | |
|
1895 | 0 | return fz_rect_from_quad(quad); |
1896 | 0 | } |
1897 | | |
/* Approximate float equality: non-zero when a and b differ by less than EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
	float diff = a - b;

	return (diff < 0 ? -diff : diff) < EPSILON;
}
1906 | | |
1907 | | static void |
1908 | | check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness) |
1909 | 0 | { |
1910 | 0 | for ( ; block; block = block->next) |
1911 | 0 | { |
1912 | 0 | fz_stext_line *line; |
1913 | |
|
1914 | 0 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
1915 | 0 | continue; |
1916 | | |
1917 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
1918 | 0 | { |
1919 | 0 | fz_stext_char *ch; |
1920 | |
|
1921 | 0 | if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) && |
1922 | 0 | (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y))) |
1923 | 0 | continue; |
1924 | | |
1925 | | /* Matching directions... */ |
1926 | | |
1927 | | /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check |
1928 | | * chars. - FIXME: Now we do! */ |
1929 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
1930 | 0 | { |
1931 | 0 | fz_point up; |
1932 | 0 | float dx, dy, dot; |
1933 | 0 | fz_rect ch_box; |
1934 | | |
1935 | | /* If the thickness is more than a 1/4 of the size, it's a highlight, not a |
1936 | | * line! */ |
1937 | 0 | if (ch->size < thickness*4) |
1938 | 0 | continue; |
1939 | | |
1940 | 0 | ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size); |
1941 | |
|
1942 | 0 | if (!line_crosses_rect(from, to, ch_box)) |
1943 | 0 | continue; |
1944 | | |
1945 | | /* Is this a strikeout or an underline? */ |
1946 | | |
1947 | | /* The baseline moves from ch->origin in the direction line->dir */ |
1948 | 0 | up.x = line->dir.y; |
1949 | 0 | up.y = -line->dir.x; |
1950 | | |
1951 | | /* How far is our line displaced from the line through the origin? */ |
1952 | 0 | dx = from.x - ch->origin.x; |
1953 | 0 | dy = from.y - ch->origin.y; |
1954 | | /* Dot product with up. up is normalised */ |
1955 | 0 | dot = dx * up.x + dy * up.y; |
1956 | |
|
1957 | 0 | if (dot > 0 && dot <= 0.8f * ch->font->ascender * ch->size) |
1958 | 0 | ch->flags |= FZ_STEXT_STRIKEOUT; |
1959 | 0 | else |
1960 | 0 | ch->flags |= FZ_STEXT_UNDERLINE; |
1961 | 0 | } |
1962 | 0 | } |
1963 | 0 | } |
1964 | 0 | } |
1965 | | |
1966 | | static void |
1967 | | check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page) |
1968 | 0 | { |
1969 | 0 | int i, n = tdev->rect_len; |
1970 | |
|
1971 | 0 | for (i = 0; i < n; i++) |
1972 | 0 | { |
1973 | 0 | fz_point from = tdev->rects[i].from; |
1974 | 0 | fz_point to = tdev->rects[i].to; |
1975 | 0 | float thickness = tdev->rects[i].thickness; |
1976 | 0 | fz_point dir; |
1977 | 0 | dir.x = to.x - from.x; |
1978 | 0 | dir.y = to.y - from.y; |
1979 | 0 | dir = fz_normalize_vector(dir); |
1980 | |
|
1981 | 0 | check_strikeout(ctx, page->first_block, from, to, dir, thickness); |
1982 | 0 | } |
1983 | 0 | } |
1984 | | |
1985 | | static void |
1986 | | fz_stext_close_device(fz_context *ctx, fz_device *dev) |
1987 | 0 | { |
1988 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1989 | 0 | fz_stext_page *page = tdev->page; |
1990 | |
|
1991 | 0 | if ((tdev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(tdev->lastchar) && tdev->lastline != NULL) |
1992 | 0 | tdev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED; |
1993 | |
|
1994 | 0 | flush_lazy_vectors(ctx, page, tdev); |
1995 | |
|
1996 | 0 | fixup_bboxes_and_bidi(ctx, page->first_block); |
1997 | |
|
1998 | 0 | if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) |
1999 | 0 | check_rects_for_strikeout(ctx, tdev, page); |
2000 | | |
2001 | | /* TODO: smart sorting of blocks and lines in reading order */ |
2002 | | /* TODO: unicode NFC normalization */ |
2003 | |
|
2004 | 0 | if (tdev->opts.flags & FZ_STEXT_SEGMENT) |
2005 | 0 | fz_segment_stext_page(ctx, page); |
2006 | |
|
2007 | 0 | if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK) |
2008 | 0 | fz_paragraph_break(ctx, page); |
2009 | |
|
2010 | 0 | if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT) |
2011 | 0 | fz_table_hunt(ctx, page); |
2012 | 0 | } |
2013 | | |
2014 | | static void |
2015 | | fz_stext_drop_device(fz_context *ctx, fz_device *dev) |
2016 | 0 | { |
2017 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2018 | 0 | fz_drop_text(ctx, tdev->lasttext); |
2019 | 0 | fz_drop_font(ctx, tdev->last.font); |
2020 | 0 | while (tdev->metatext) |
2021 | 0 | pop_metatext(ctx, tdev); |
2022 | |
|
2023 | 0 | fz_free(ctx, tdev->rects); |
2024 | 0 | } |
2025 | | |
2026 | | static int |
2027 | | val_is_rect(const char *val, fz_rect *rp) |
2028 | 0 | { |
2029 | 0 | fz_rect r; |
2030 | 0 | const char *s; |
2031 | |
|
2032 | 0 | s = strchr(val, ':'); |
2033 | 0 | if (s == NULL || s == val) |
2034 | 0 | return 0; |
2035 | 0 | r.x0 = fz_atof(val); |
2036 | 0 | val = s+1; |
2037 | 0 | s = strchr(val, ':'); |
2038 | 0 | if (s == NULL || s == val) |
2039 | 0 | return 0; |
2040 | 0 | r.y0 = fz_atof(val); |
2041 | 0 | val = s+1; |
2042 | 0 | s = strchr(val, ':'); |
2043 | 0 | if (s == NULL || s == val) |
2044 | 0 | return 0; |
2045 | 0 | r.x1 = fz_atof(val); |
2046 | 0 | val = s+1; |
2047 | 0 | r.y1 = fz_atof(val); |
2048 | |
|
2049 | 0 | *rp = r; |
2050 | |
|
2051 | 0 | return 1; |
2052 | 0 | } |
2053 | | |
2054 | | void fz_init_stext_options(fz_context *ctx, fz_stext_options *opts) |
2055 | 0 | { |
2056 | 0 | memset(opts, 0, sizeof *opts); |
2057 | |
|
2058 | 0 | opts->flags |= FZ_STEXT_CLIP; |
2059 | 0 | opts->scale = 1; |
2060 | 0 | } |
2061 | | |
2062 | | fz_stext_options * |
2063 | | fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) |
2064 | 0 | { |
2065 | 0 | fz_options *options = fz_new_options(ctx, string); |
2066 | 0 | fz_try(ctx) |
2067 | 0 | { |
2068 | 0 | fz_init_stext_options(ctx, opts); |
2069 | 0 | fz_apply_stext_options(ctx, opts, options); |
2070 | 0 | fz_throw_on_unused_options(ctx, options, "stext"); |
2071 | 0 | } |
2072 | 0 | fz_always(ctx) |
2073 | 0 | fz_drop_options(ctx, options); |
2074 | 0 | fz_catch(ctx) |
2075 | 0 | fz_rethrow(ctx); |
2076 | 0 | return opts; |
2077 | 0 | } |
2078 | | |
2079 | | #define SETCLEARBOOL(A, B, C) \ |
2080 | 0 | (A) = (B) ? ((A) | (C)) : ((A) & ~(C)) |
2081 | | |
2082 | | void |
2083 | | fz_apply_stext_options(fz_context *ctx, fz_stext_options *opts, fz_options *string) |
2084 | 0 | { |
2085 | 0 | const char *val; |
2086 | 0 | float x; |
2087 | 0 | int b; |
2088 | | |
2089 | | /* when adding options, remember to update fz_stext_options_usage above */ |
2090 | |
|
2091 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-ligatures", &b)) |
2092 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_LIGATURES); |
2093 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-whitespace", &b)) |
2094 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_WHITESPACE); |
2095 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-images", &b)) |
2096 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_IMAGES); |
2097 | 0 | if (fz_lookup_option_boolean(ctx, string, "inhibit-spaces", &b)) |
2098 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_INHIBIT_SPACES); |
2099 | 0 | if (fz_lookup_option_boolean(ctx, string, "dehyphenate", &b)) |
2100 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_DEHYPHENATE); |
2101 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-spans", &b)) |
2102 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_SPANS); |
2103 | 0 | if (fz_lookup_option_boolean(ctx, string, "structured", &b)) |
2104 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STRUCTURE); |
2105 | 0 | if (fz_lookup_option_boolean(ctx, string, "use-cid-for-unknown-unicode", &b)) |
2106 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE); |
2107 | 0 | if (fz_lookup_option_boolean(ctx, string, "use-gid-for-unknown-unicode", &b)) |
2108 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE); |
2109 | 0 | if (fz_lookup_option_boolean(ctx, string, "accurate-bboxes", &b)) |
2110 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_BBOXES); |
2111 | 0 | if (fz_lookup_option_boolean(ctx, string, "vectors", &b)) |
2112 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_VECTORS); |
2113 | 0 | if (fz_lookup_option_boolean(ctx, string, "lazy-vectors", &b)) |
2114 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_LAZY_VECTORS); |
2115 | 0 | if (fz_lookup_option_boolean(ctx, string, "fuzzy-vectors", &b)) |
2116 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_FUZZY_VECTORS); |
2117 | 0 | if (fz_lookup_option_boolean(ctx, string, "ignore-actualtext", &b)) |
2118 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_IGNORE_ACTUALTEXT); |
2119 | 0 | if (fz_lookup_option_boolean(ctx, string, "segment", &b)) |
2120 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_SEGMENT); |
2121 | 0 | if (fz_lookup_option_boolean(ctx, string, "paragraph-break", &b)) |
2122 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PARAGRAPH_BREAK); |
2123 | 0 | if (fz_lookup_option_boolean(ctx, string, "table-hunt", &b)) |
2124 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_TABLE_HUNT); |
2125 | 0 | if (fz_lookup_option_boolean(ctx, string, "collect-styles", &b)) |
2126 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STYLES); |
2127 | 0 | if (fz_lookup_option_boolean(ctx, string, "accurate-ascenders", &b)) |
2128 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_ASCENDERS); |
2129 | 0 | if (fz_lookup_option_boolean(ctx, string, "accurate-side-bearings", &b)) |
2130 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_SIDE_BEARINGS); |
2131 | |
|
2132 | 0 | if (fz_lookup_option_boolean(ctx, string, "mediabox-clip", &b)) |
2133 | 0 | { |
2134 | 0 | fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead."); |
2135 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP); |
2136 | 0 | } |
2137 | 0 | if (fz_lookup_option_boolean(ctx, string, "clip", &b)) |
2138 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP); |
2139 | |
|
2140 | 0 | if (fz_lookup_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip)) |
2141 | 0 | opts->flags |= FZ_STEXT_CLIP_RECT; |
2142 | |
|
2143 | 0 | if (fz_lookup_option_float(ctx, string, "resolution", &x)) |
2144 | 0 | opts->scale = x / 96.0f; /* HTML base resolution is 96ppi */ |
2145 | |
|
2146 | 0 | fz_validate_options(ctx, string, "stext"); |
2147 | 0 | } |
2148 | | |
/* State accumulated while walking a path to decide whether it is a
 * simple 4 cornered shape (see the is_rect_* path walker callbacks). */
typedef struct
{
	/* Set non-zero as soon as the path is found not to qualify (a curve,
	 * a second moveto, or too many points). */
	int fail;
	/* Number of entries of corners[] filled in so far. */
	int count;
	/* The points seen, in path order. */
	fz_point corners[4];
} is_rect_data;
2155 | | |
2156 | | static void |
2157 | | stash_point(is_rect_data *rd, float x, float y) |
2158 | 0 | { |
2159 | 0 | if (rd->count > 3) |
2160 | 0 | { |
2161 | 0 | rd->fail = 1; |
2162 | 0 | return; |
2163 | 0 | } |
2164 | | |
2165 | 0 | rd->corners[rd->count].x = x; |
2166 | 0 | rd->corners[rd->count].y = y; |
2167 | 0 | rd->count++; |
2168 | 0 | } |
2169 | | |
2170 | | static void |
2171 | | is_rect_moveto(fz_context *ctx, void *arg, float x, float y) |
2172 | 0 | { |
2173 | 0 | is_rect_data *rd = arg; |
2174 | 0 | if (rd->fail) |
2175 | 0 | return; |
2176 | | |
2177 | 0 | if (rd->count != 0) |
2178 | 0 | { |
2179 | 0 | rd->fail = 1; |
2180 | 0 | return; |
2181 | 0 | } |
2182 | 0 | stash_point(rd, x, y); |
2183 | 0 | } |
2184 | | |
2185 | | static void |
2186 | | is_rect_lineto(fz_context *ctx, void *arg, float x, float y) |
2187 | 0 | { |
2188 | 0 | is_rect_data *rd = arg; |
2189 | 0 | if (rd->fail) |
2190 | 0 | return; |
2191 | | |
2192 | 0 | if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y) |
2193 | 0 | return; |
2194 | | |
2195 | 0 | stash_point(rd, x, y); |
2196 | 0 | } |
2197 | | |
2198 | | static void |
2199 | | is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3) |
2200 | 0 | { |
2201 | 0 | is_rect_data *rd = arg; |
2202 | 0 | rd->fail = 1; |
2203 | 0 | } |
2204 | | |
2205 | | static void |
2206 | | is_rect_closepath(fz_context *ctx, void *arg) |
2207 | 0 | { |
2208 | 0 | is_rect_data *rd = arg; |
2209 | 0 | if (rd->fail) |
2210 | 0 | return; |
2211 | 0 | if (rd->count == 3) |
2212 | 0 | stash_point(rd, rd->corners[0].x, rd->corners[0].y); |
2213 | 0 | if (rd->count != 4) |
2214 | 0 | rd->fail = 1; |
2215 | 0 | } |
2216 | | |
2217 | | static int |
2218 | | is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm) |
2219 | 0 | { |
2220 | 0 | float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y; |
2221 | 0 | is_rect_data rd = { 0 }; |
2222 | 0 | static const fz_path_walker walker = |
2223 | 0 | { |
2224 | 0 | is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath |
2225 | 0 | }; |
2226 | 0 | int i; |
2227 | |
|
2228 | 0 | fz_walk_path(ctx, path, &walker, &rd); |
2229 | |
|
2230 | 0 | if (rd.fail) |
2231 | 0 | return 0; |
2232 | | |
2233 | 0 | if (rd.count == 2) |
2234 | 0 | { |
2235 | 0 | stash_point(&rd, rd.corners[1].x, rd.corners[1].y); |
2236 | 0 | stash_point(&rd, rd.corners[0].x, rd.corners[0].y); |
2237 | 0 | } |
2238 | |
|
2239 | 0 | for (i = 0 ; i < 4; i++) |
2240 | 0 | { |
2241 | 0 | fz_point p = fz_transform_point(rd.corners[i], ctm); |
2242 | |
|
2243 | 0 | rd.corners[i].x = p.x; |
2244 | 0 | rd.corners[i].y = p.y; |
2245 | 0 | } |
2246 | | |
2247 | | /* So we have a 4 cornered path. Hopefully something like: |
2248 | | * 0---------1 |
2249 | | * | | |
2250 | | * 3---------2 |
2251 | | * but it might be: |
2252 | | * 0---------3 |
2253 | | * | | |
2254 | | * 1---------2 |
2255 | | */ |
2256 | 0 | while (1) |
2257 | 0 | { |
2258 | 0 | d01x = rd.corners[1].x - rd.corners[0].x; |
2259 | 0 | d01y = rd.corners[1].y - rd.corners[0].y; |
2260 | 0 | d01 = d01x * d01x + d01y * d01y; |
2261 | 0 | d03x = rd.corners[3].x - rd.corners[0].x; |
2262 | 0 | d03y = rd.corners[3].y - rd.corners[0].y; |
2263 | 0 | d03 = d03x * d03x + d03y * d03y; |
2264 | 0 | if(d01 < d03) |
2265 | 0 | { |
2266 | | /* We are the latter case. Transpose it. */ |
2267 | 0 | fz_point p = rd.corners[1]; |
2268 | 0 | rd.corners[1] = rd.corners[3]; |
2269 | 0 | rd.corners[3] = p; |
2270 | 0 | } |
2271 | 0 | else |
2272 | 0 | break; |
2273 | 0 | } |
2274 | 0 | d32x = rd.corners[2].x - rd.corners[3].x; |
2275 | 0 | d32y = rd.corners[2].y - rd.corners[3].y; |
2276 | | |
2277 | | /* So d32x and d01x need to be the same for this to be a strikeout. */ |
2278 | 0 | if (!feq(d32x, d01x) || !feq(d32y, d01y)) |
2279 | 0 | return 0; |
2280 | | |
2281 | | /* We are plausibly a rectangle. */ |
2282 | 0 | *thickness = sqrtf(d03x * d03x + d03y * d03y); |
2283 | |
|
2284 | 0 | from->x = (rd.corners[0].x + rd.corners[3].x)/2; |
2285 | 0 | from->y = (rd.corners[0].y + rd.corners[3].y)/2; |
2286 | 0 | to->x = (rd.corners[1].x + rd.corners[2].x)/2; |
2287 | 0 | to->y = (rd.corners[1].y + rd.corners[2].y)/2; |
2288 | |
|
2289 | 0 | return 1; |
2290 | 0 | } |
2291 | | |
2292 | | static void |
2293 | | check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm) |
2294 | 0 | { |
2295 | 0 | float thickness; |
2296 | 0 | fz_point from, to; |
2297 | | |
2298 | | /* Is this path a thin rectangle (possibly rotated)? If so, then we need to |
2299 | | * consider it as being a strikeout or underline. */ |
2300 | 0 | if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm)) |
2301 | 0 | return; |
2302 | | |
2303 | | /* Add to the list of rects in the device. */ |
2304 | 0 | if (tdev->rect_len == tdev->rect_max) |
2305 | 0 | { |
2306 | 0 | int newmax = tdev->rect_max * 2; |
2307 | 0 | if (newmax == 0) |
2308 | 0 | newmax = 32; |
2309 | |
|
2310 | 0 | tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax); |
2311 | 0 | tdev->rect_max = newmax; |
2312 | 0 | } |
2313 | 0 | tdev->rects[tdev->rect_len].from = from; |
2314 | 0 | tdev->rects[tdev->rect_len].to = to; |
2315 | 0 | tdev->rects[tdev->rect_len].thickness = thickness; |
2316 | 0 | tdev->rect_len++; |
2317 | 0 | } |
2318 | | |
2319 | | static void |
2320 | | add_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, fz_rect bbox, uint32_t flags, uint32_t argb, int id, float exp) |
2321 | 0 | { |
2322 | 0 | fz_stext_block *b; |
2323 | |
|
2324 | 0 | if (exp != 0) |
2325 | 0 | { |
2326 | 0 | bbox.x0 -= exp; |
2327 | 0 | bbox.y0 -= exp; |
2328 | 0 | bbox.x1 += exp; |
2329 | 0 | bbox.y1 += exp; |
2330 | 0 | } |
2331 | |
|
2332 | 0 | if (tdev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP)) |
2333 | 0 | { |
2334 | 0 | fz_rect r = current_clip(ctx, tdev); |
2335 | 0 | bbox = fz_intersect_rect(bbox, r); |
2336 | 0 | if (!fz_is_valid_rect(bbox)) |
2337 | 0 | return; |
2338 | 0 | } |
2339 | | |
2340 | | /* Can we just add this one onto the previous one? */ |
2341 | | /* Only if it's a small rectangle... */ |
2342 | 0 | if ((flags & FZ_STEXT_VECTOR_IS_RECTANGLE) && bbox.x1 - bbox.x0 <= 2 && bbox.y1 - bbox.y0 <= 2) |
2343 | 0 | { |
2344 | 0 | fz_stext_block *prev; |
2345 | | /* Find b = the previous block. */ |
2346 | 0 | if (tdev->flags & FZ_STEXT_LAZY_VECTORS) |
2347 | 0 | b = tdev->lazy_vectors_tail; |
2348 | 0 | else if (page->last_struct) |
2349 | 0 | b = page->last_struct->last_block; |
2350 | 0 | else |
2351 | 0 | b = page->last_block; |
2352 | |
|
2353 | 0 | if (b && b->type == FZ_STEXT_BLOCK_VECTOR && b->u.v.argb == argb && b->u.v.flags == flags) |
2354 | 0 | { |
2355 | | /* Maybe we can join it? */ |
2356 | 0 | float fudge = 0.001f; |
2357 | 0 | if (b->bbox.x0 == bbox.x0 && b->bbox.x1 == bbox.x1 && b->bbox.y1 + fudge >= bbox.y0 && b->bbox.y0 - fudge <= bbox.y1) |
2358 | 0 | { |
2359 | | /* Stacks vertically. */ |
2360 | 0 | b->bbox.y0 = fz_min(b->bbox.y0, bbox.y0); |
2361 | 0 | b->bbox.y1 = fz_max(b->bbox.y1, bbox.y1); |
2362 | 0 | return; |
2363 | 0 | } |
2364 | 0 | else if (b->bbox.y0 == bbox.y0 && b->bbox.y1 == bbox.y1 && b->bbox.x1 + fudge >= bbox.x0 && b->bbox.x0 - fudge <= bbox.x1) |
2365 | 0 | { |
2366 | | /* Stacks horizontally. */ |
2367 | 0 | b->bbox.x0 = fz_min(b->bbox.x0, bbox.x0); |
2368 | 0 | b->bbox.x1 = fz_max(b->bbox.x1, bbox.x1); |
2369 | 0 | return; |
2370 | 0 | } |
2371 | | |
2372 | | /* So, we can't add our new vector onto the previous one. But can we merge the 2 previous ones? */ |
2373 | | /* The intent here is that we allow a set of vector 'blocks' to be merged together, perhaps: |
2374 | | * ABC |
2375 | | * Then we allow another set to be merged together, perhaps DE: |
2376 | | * ABC |
2377 | | * DE |
2378 | | * Then when we get another block that can't be merged into DE (perhaps F): |
2379 | | * ABC |
2380 | | * DE |
2381 | | * F |
2382 | | * We'll consider ABC and DE for merging. Whatevever block that F ends up |
2383 | | * in later (maybe FGH): |
2384 | | * ABC |
2385 | | * DE |
2386 | | * FGH |
2387 | | * will be considered for merging later. We can always do this "exactly" (if the blocks |
2388 | | * line up precisely), but to do this 'lossily', we guard it with 'FUZZY_VECTORS'. |
2389 | | */ |
2390 | 0 | prev = b->prev; |
2391 | 0 | while (prev && prev->type == FZ_STEXT_BLOCK_VECTOR && (prev->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE)) |
2392 | 0 | { |
2393 | | /* Lossless merging. */ |
2394 | 0 | if (prev->bbox.x0 == b->bbox.x0 && prev->bbox.x1 == b->bbox.x1 && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1) |
2395 | 0 | { |
2396 | | /* Stacks exactly vertically. Very rarely hit. */ |
2397 | 0 | prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0); |
2398 | 0 | prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1); |
2399 | 0 | return; |
2400 | 0 | } |
2401 | 0 | else if (prev->bbox.y0 == b->bbox.y0 && prev->bbox.y1 == b->bbox.y1 && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1) |
2402 | 0 | { |
2403 | | /* Stacks horizontally. Very rarely hit. */ |
2404 | 0 | prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0); |
2405 | 0 | prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1); |
2406 | 0 | return; |
2407 | 0 | } |
2408 | 0 | if (tdev->flags & FZ_STEXT_FUZZY_VECTORS) |
2409 | 0 | { |
2410 | | /* Be more forgiving in how we merge vectors */ |
2411 | | /* We need to be careful not to merge together differently oriented borders for table cells. |
2412 | | * C |
2413 | | * | |
2414 | | * v |
2415 | | * +-----+-----+ |
2416 | | * A-> | | | |
2417 | | * +-----+-----+ |
2418 | | * B-> | | | |
2419 | | * +-----+-----+ |
2420 | | * |
2421 | | * It'd be fine to merge borders A and B together, because it still signifies the same |
2422 | | * edges. It would NOT be fine to merge A and C together, because we'd lose the sense |
2423 | | * of them being borders, and just have a blob that covered the cell. |
2424 | | * The fudge2 logic below should hopefully allow for this, as well as allowing us to |
2425 | | * match blocks like: |
2426 | | * ABC |
2427 | | * DE FG |
2428 | | * HIJ |
2429 | | * KL MN |
2430 | | * OPQ |
2431 | | */ |
2432 | 0 | float fudge2 = 2; |
2433 | 0 | if ((fabsf(prev->bbox.x0 - b->bbox.x0) <= fudge2 || fabsf(prev->bbox.x1 - b->bbox.x1) <= fudge2) && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1) |
2434 | 0 | { |
2435 | | /* Stacks vertically. */ |
2436 | 0 | goto join; |
2437 | 0 | } |
2438 | 0 | else if ((fabsf(prev->bbox.y0 - b->bbox.y0) <= fudge2 || fabsf(prev->bbox.y1 - b->bbox.y1) <= fudge2) && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1) |
2439 | 0 | { |
2440 | | /* Stacks horizontally. */ |
2441 | 0 | join: |
2442 | 0 | prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0); |
2443 | 0 | prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1); |
2444 | 0 | prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0); |
2445 | 0 | prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1); |
2446 | | /* Unlink b (so, fiddle with b->prev, which is not necessarily prev!) */ |
2447 | 0 | b->prev->next = NULL; |
2448 | 0 | if (tdev->flags & FZ_STEXT_LAZY_VECTORS) |
2449 | 0 | tdev->lazy_vectors_tail = b->prev; |
2450 | 0 | else if (page->last_struct) |
2451 | 0 | page->last_struct->last_block = b->prev; |
2452 | 0 | else |
2453 | 0 | page->last_block = b->prev; |
2454 | 0 | break; |
2455 | 0 | } |
2456 | 0 | } |
2457 | | /* Now, allow for looking further back. */ |
2458 | 0 | prev = prev->prev; |
2459 | 0 | } |
2460 | 0 | } |
2461 | 0 | } |
2462 | | |
2463 | 0 | if (tdev->flags & FZ_STEXT_LAZY_VECTORS) |
2464 | 0 | b = add_lazy_vector(ctx, page, tdev, id); |
2465 | 0 | else |
2466 | 0 | b = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_VECTOR, id); |
2467 | |
|
2468 | 0 | b->bbox = bbox; |
2469 | 0 | b->u.v.flags = flags; |
2470 | 0 | b->u.v.argb = argb; |
2471 | 0 | } |
2472 | | |
/* State used while splitting a path up into vectors (see maybe_rect). */
typedef struct
{
	/* The device the vectors are being collected for. */
	fz_stext_device *dev;
	/* NOTE(review): presumably the transform applied to the path's points - confirm against the walker callbacks. */
	fz_matrix ctm;
	/* Colour passed to add_vector for each vector emitted. */
	uint32_t argb;
	/* Base vector flags passed to add_vector (extra bits are OR'd in per vector). */
	uint32_t flags;
	/* The page that vectors are added to. */
	fz_stext_page *page;
	/* Bounds used for a segment that turns out not to be a rectangle. */
	fz_rect seg_bounds;
	/* NOTE(review): presumably the accumulated bounds of non-rectangular content - confirm. */
	fz_rect leftovers;
	/* A rectangle that has been recognised but not yet emitted. */
	fz_rect pending;
	/* Number of entries of p[] in use for the current segment. */
	int count;
	/* The points of the current segment. */
	fz_point p[5];
	/* Id passed to add_vector. */
	int id;
	/* Expansion passed to add_vector, which grows each bbox by it on every side. */
	float exp;
} split_path_data;
2488 | | |
2489 | | static void |
2490 | | maybe_rect(fz_context *ctx, split_path_data *sp) |
2491 | 0 | { |
2492 | 0 | int rect = 0; |
2493 | 0 | int i; |
2494 | 0 | fz_rect leftovers; |
2495 | |
|
2496 | 0 | if (sp->count >= 0) |
2497 | 0 | { |
2498 | 0 | if (sp->count == 3) |
2499 | 0 | { |
2500 | | /* Allow for "moveto A, lineto B, lineto A, close" */ |
2501 | 0 | if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y)) |
2502 | 0 | sp->count = 2; |
2503 | 0 | } |
2504 | 0 | if (sp->count == 2) |
2505 | 0 | { |
2506 | 0 | if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y)) |
2507 | 0 | rect = 1; /* Count that as a rect */ |
2508 | 0 | } |
2509 | 0 | else if (sp->count == 4 || sp->count == 5) |
2510 | 0 | { |
2511 | 0 | if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y)) |
2512 | 0 | rect = 1; |
2513 | 0 | else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y)) |
2514 | 0 | rect = 1; |
2515 | 0 | } |
2516 | 0 | if (rect) |
2517 | 0 | { |
2518 | 0 | fz_rect bounds; |
2519 | |
|
2520 | 0 | bounds.x0 = bounds.x1 = sp->p[0].x; |
2521 | 0 | bounds.y0 = bounds.y1 = sp->p[0].y; |
2522 | 0 | for (i = 1; i < sp->count; i++) |
2523 | 0 | bounds = fz_include_point_in_rect(bounds, sp->p[i]); |
2524 | 0 | if (fz_is_valid_rect(sp->pending)) |
2525 | 0 | add_vector(ctx, sp->page, sp->dev, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb, sp->id, sp->exp); |
2526 | 0 | sp->pending = bounds; |
2527 | 0 | return; |
2528 | 0 | } |
2529 | 0 | } |
2530 | | |
2531 | | /* We aren't a rectangle! */ |
2532 | 0 | leftovers = sp->seg_bounds; |
2533 | |
|
2534 | 0 | if (sp->dev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP)) |
2535 | 0 | leftovers = fz_intersect_rect(leftovers, current_clip(ctx, sp->dev)); |
2536 | |
|
2537 | 0 | if (fz_is_valid_rect(leftovers)) |
2538 | 0 | sp->leftovers = fz_union_rect(sp->leftovers, leftovers); |
2539 | | |
2540 | | /* Remember we're not a rect. */ |
2541 | 0 | sp->count = -1; |
2542 | 0 | } |
2543 | | |
2544 | | static void |
2545 | | split_move(fz_context *ctx, void *arg, float x, float y) |
2546 | 0 | { |
2547 | 0 | split_path_data *sp = (split_path_data *)arg; |
2548 | 0 | fz_point p = fz_transform_point_xy(x, y, sp->ctm); |
2549 | |
|
2550 | 0 | maybe_rect(ctx, sp); |
2551 | 0 | sp->p[0] = p; |
2552 | 0 | sp->count = 1; |
2553 | 0 | sp->seg_bounds.x0 = sp->seg_bounds.x1 = p.x; |
2554 | 0 | sp->seg_bounds.y0 = sp->seg_bounds.y1 = p.y; |
2555 | 0 | } |
2556 | | |
2557 | | static void |
2558 | | split_line(fz_context *ctx, void *arg, float x, float y) |
2559 | 0 | { |
2560 | 0 | split_path_data *sp = (split_path_data *)arg; |
2561 | 0 | fz_point p = fz_transform_point_xy(x, y, sp->ctm); |
2562 | |
|
2563 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, p); |
2564 | |
|
2565 | 0 | if (sp->count >= 0) |
2566 | 0 | { |
2567 | | /* Check for lines to the same point. */ |
2568 | 0 | if (feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y)) |
2569 | 0 | return; |
2570 | | /* If we're still maybe a rect, just record the point. */ |
2571 | 0 | if (sp->count < 4) |
2572 | 0 | { |
2573 | 0 | sp->p[sp->count++] = p; |
2574 | 0 | return; |
2575 | 0 | } |
2576 | | /* Check for close line? */ |
2577 | 0 | if (sp->count == 4) |
2578 | 0 | { |
2579 | 0 | if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y)) |
2580 | 0 | { |
2581 | | /* We've just drawn a line back to the start point. */ |
2582 | | /* Needless saving of point, but it makes the logic |
2583 | | * easier elsewhere. */ |
2584 | 0 | sp->p[sp->count++] = p; |
2585 | 0 | return; |
2586 | 0 | } |
2587 | 0 | } |
2588 | | /* We can no longer be a rect. */ |
2589 | 0 | sp->count = -1; |
2590 | 0 | } |
2591 | 0 | } |
2592 | | |
2593 | | static void |
2594 | | split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3) |
2595 | 0 | { |
2596 | 0 | split_path_data *sp = (split_path_data *)arg; |
2597 | |
|
2598 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x1, y1, sp->ctm)); |
2599 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x2, y2, sp->ctm)); |
2600 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x3, y3, sp->ctm)); |
2601 | | |
2602 | | /* We can no longer be a rect. */ |
2603 | 0 | sp->count = -1; |
2604 | 0 | } |
2605 | | |
2606 | | static void |
2607 | | split_close(fz_context *ctx, void *arg) |
2608 | 0 | { |
2609 | 0 | split_path_data *sp = (split_path_data *)arg; |
2610 | |
|
2611 | 0 | maybe_rect(ctx, sp); |
2612 | 0 | sp->count = 0; |
2613 | 0 | } |
2614 | | |
2615 | | |
2616 | | static const |
2617 | | fz_path_walker split_path_rects = |
2618 | | { |
2619 | | split_move, |
2620 | | split_line, |
2621 | | split_curve, |
2622 | | split_close |
2623 | | }; |
2624 | | |
2625 | | static void |
2626 | | add_vectors_from_path(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke, float exp) |
2627 | 0 | { |
2628 | 0 | int have_leftovers; |
2629 | 0 | split_path_data sp; |
2630 | 0 | int id = tdev->id; |
2631 | |
|
2632 | 0 | sp.dev = tdev; |
2633 | 0 | sp.ctm = ctm; |
2634 | 0 | sp.argb = hexrgba_from_color(ctx, cs, color, alpha); |
2635 | 0 | sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0; |
2636 | 0 | sp.page = page; |
2637 | 0 | sp.count = 0; |
2638 | 0 | sp.leftovers = fz_empty_rect; |
2639 | 0 | sp.seg_bounds = fz_empty_rect; |
2640 | 0 | sp.pending = fz_empty_rect; |
2641 | 0 | sp.id = id; |
2642 | 0 | sp.exp = exp; |
2643 | 0 | fz_walk_path(ctx, path, &split_path_rects, &sp); |
2644 | |
|
2645 | 0 | have_leftovers = fz_is_valid_rect(sp.leftovers); |
2646 | |
|
2647 | 0 | maybe_rect(ctx, &sp); |
2648 | |
|
2649 | 0 | if (fz_is_valid_rect(sp.pending)) |
2650 | 0 | add_vector(ctx, page, sp.dev, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb, id, exp); |
2651 | 0 | if (have_leftovers) |
2652 | 0 | add_vector(ctx, page, sp.dev, sp.leftovers, sp.flags, sp.argb, id, exp); |
2653 | 0 | } |
2654 | | |
2655 | | static void |
2656 | | fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp) |
2657 | 0 | { |
2658 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2659 | 0 | fz_stext_page *page = tdev->page; |
2660 | 0 | fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm); |
2661 | 0 | fz_rect *bounds = actualtext_bounds(tdev); |
2662 | | |
2663 | | /* If we're in an actualtext, then update the bounds to include this content. */ |
2664 | 0 | if (bounds != NULL) |
2665 | 0 | *bounds = fz_union_rect(*bounds, path_bounds); |
2666 | |
|
2667 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_STYLES) |
2668 | 0 | check_for_strikeout(ctx, tdev, page, path, ctm); |
2669 | |
|
2670 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_VECTORS) |
2671 | 0 | add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, 0, 0); |
2672 | 0 | } |
2673 | | |
2674 | | static void |
2675 | | fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp) |
2676 | 0 | { |
2677 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2678 | 0 | fz_stext_page *page = tdev->page; |
2679 | 0 | fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm); |
2680 | 0 | fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev); |
2681 | 0 | float exp = ss->linewidth / 2; |
2682 | | |
2683 | | /* If we're in an actualtext, then update the bounds to include this content. */ |
2684 | 0 | if (bounds != NULL) |
2685 | 0 | *bounds = fz_union_rect(*bounds, path_bounds); |
2686 | |
|
2687 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_STYLES) |
2688 | 0 | check_for_strikeout(ctx, tdev, page, path, ctm); |
2689 | |
|
2690 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_VECTORS) |
2691 | 0 | add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, 1, exp); |
2692 | 0 | } |
2693 | | |
2694 | | static void |
2695 | | new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw) |
2696 | 0 | { |
2697 | 0 | fz_stext_struct *str; |
2698 | 0 | size_t z; |
2699 | |
|
2700 | 0 | if (raw == NULL) |
2701 | 0 | raw = ""; |
2702 | 0 | z = strlen(raw); |
2703 | |
|
2704 | 0 | str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1); |
2705 | 0 | str->first_block = NULL; |
2706 | 0 | str->last_block = NULL; |
2707 | 0 | str->standard = standard; |
2708 | 0 | str->parent = page->last_struct; |
2709 | 0 | str->up = block; |
2710 | 0 | memcpy(str->raw, raw, z+1); |
2711 | |
|
2712 | 0 | block->u.s.down = str; |
2713 | 0 | } |
2714 | | |
2715 | | fz_stext_block * |
2716 | | fz_new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_structure standard, const char *raw, int idx) |
2717 | 0 | { |
2718 | 0 | fz_stext_block *block; |
2719 | |
|
2720 | 0 | block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
2721 | 0 | block->bbox = fz_empty_rect; |
2722 | 0 | block->prev = NULL; |
2723 | 0 | block->next = NULL; |
2724 | 0 | block->type = FZ_STEXT_BLOCK_STRUCT; |
2725 | 0 | block->u.s.index = idx; |
2726 | 0 | block->u.s.down = NULL; |
2727 | | /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */ |
2728 | 0 | new_stext_struct(ctx, page, block, standard, raw); |
2729 | |
|
2730 | 0 | return block; |
2731 | 0 | } |
2732 | | |
2733 | | |
2734 | | static void |
2735 | | fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx) |
2736 | 0 | { |
2737 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2738 | 0 | fz_stext_page *page = tdev->page; |
2739 | 0 | fz_stext_block *block, *le, *gt, *newblock; |
2740 | |
|
2741 | 0 | if (raw == NULL) |
2742 | 0 | raw = ""; |
2743 | | |
2744 | | /* Find a pointer to the last block. */ |
2745 | 0 | if (page->last_block) |
2746 | 0 | { |
2747 | 0 | block = page->last_block; |
2748 | 0 | } |
2749 | 0 | else if (page->last_struct) |
2750 | 0 | { |
2751 | 0 | block = page->last_struct->last_block; |
2752 | 0 | } |
2753 | 0 | else |
2754 | 0 | { |
2755 | 0 | block = page->first_block; |
2756 | 0 | } |
2757 | | |
2758 | | /* So block is somewhere in the content chain. Let's try and find: |
2759 | | * le = the struct node <= idx before block in the content chain. |
2760 | | * ge = the struct node >= idx after block in the content chain. |
2761 | | * Search backwards to start with. |
2762 | | */ |
2763 | 0 | gt = NULL; |
2764 | 0 | le = block; |
2765 | 0 | while (le) |
2766 | 0 | { |
2767 | 0 | if (le->type == FZ_STEXT_BLOCK_STRUCT) |
2768 | 0 | { |
2769 | 0 | if (le->u.s.index > idx) |
2770 | 0 | gt = le; |
2771 | 0 | if (le->u.s.index <= idx) |
2772 | 0 | break; |
2773 | 0 | } |
2774 | 0 | le = le->prev; |
2775 | 0 | } |
2776 | | /* The following loop copes with finding gt (the smallest block with an index higher |
2777 | | * than we want) if we haven't found it already. The while loop in here was designed |
2778 | | * to cope with 'block' being in the middle of a list. In fact, the way the code is |
2779 | | * currently, block will always be at the end of a list, so the while won't do anything. |
2780 | | * But I'm loathe to remove it in case we ever change this code to start from wherever |
2781 | | * we did the last insertion. */ |
2782 | 0 | if (gt == NULL) |
2783 | 0 | { |
2784 | 0 | gt = block; |
2785 | 0 | while (gt) |
2786 | 0 | { |
2787 | 0 | if (gt->type == FZ_STEXT_BLOCK_STRUCT) |
2788 | 0 | { |
2789 | 0 | if (gt->u.s.index <= idx) |
2790 | 0 | le = gt; |
2791 | 0 | if (gt->u.s.index >= idx) |
2792 | 0 | break; |
2793 | 0 | } |
2794 | 0 | block = gt; |
2795 | 0 | gt = gt->next; |
2796 | 0 | } |
2797 | 0 | } |
2798 | |
|
2799 | 0 | if (le && le->u.s.index == idx) |
2800 | 0 | { |
2801 | | /* We want to move down into the le block. Does it have a struct |
2802 | | * attached yet? */ |
2803 | 0 | if (le->u.s.down == NULL) |
2804 | 0 | { |
2805 | | /* No. We need to create a new struct node. */ |
2806 | 0 | new_stext_struct(ctx, page, le, standard, raw); |
2807 | 0 | } |
2808 | 0 | else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0) |
2809 | 0 | { |
2810 | | /* Yes, but it doesn't match the one we expect! */ |
2811 | 0 | fz_warn(ctx, "Mismatched structure type!"); |
2812 | 0 | } |
2813 | 0 | page->last_struct = le->u.s.down; |
2814 | 0 | page->last_block = le->u.s.down->last_block; |
2815 | |
|
2816 | 0 | return; |
2817 | 0 | } |
2818 | | |
2819 | | /* We are going to need to create a new block. Create a complete unlinked one here. */ |
2820 | 0 | newblock = fz_new_stext_struct(ctx, page, standard, raw, idx); |
2821 | | |
2822 | | /* So now we just need to link it in somewhere. */ |
2823 | 0 | if (gt) |
2824 | 0 | { |
2825 | | /* Link it in before gt. */ |
2826 | 0 | newblock->prev = gt->prev; |
2827 | 0 | if (gt->prev) |
2828 | 0 | gt->prev->next = newblock; |
2829 | 0 | else if (page->last_struct) |
2830 | 0 | { |
2831 | | /* We're linking it in at the start under another struct! */ |
2832 | 0 | assert(page->last_struct->first_block == gt); |
2833 | 0 | assert(page->last_struct->last_block != NULL); |
2834 | 0 | page->last_struct->first_block = newblock; |
2835 | 0 | } |
2836 | 0 | else |
2837 | 0 | { |
2838 | | /* We're linking it in at the start of the page! */ |
2839 | 0 | assert(page->first_block == gt); |
2840 | 0 | page->first_block = newblock; |
2841 | 0 | } |
2842 | 0 | gt->prev = newblock; |
2843 | 0 | newblock->next = gt; |
2844 | 0 | newblock->id = gt->id; |
2845 | 0 | } |
2846 | 0 | else if (block) |
2847 | 0 | { |
2848 | | /* Link it in at the end of the list (i.e. after 'block') */ |
2849 | 0 | newblock->prev = block; |
2850 | 0 | block->next = newblock; |
2851 | 0 | if (page->last_struct) |
2852 | 0 | { |
2853 | 0 | assert(page->last_struct->last_block == block); |
2854 | 0 | page->last_struct->last_block = newblock; |
2855 | 0 | } |
2856 | 0 | else |
2857 | 0 | { |
2858 | 0 | assert(page->last_block == block); |
2859 | 0 | page->last_block = newblock; |
2860 | 0 | } |
2861 | 0 | newblock->id = block->id; |
2862 | 0 | } |
2863 | 0 | else if (page->last_struct) |
2864 | 0 | { |
2865 | | /* We have no blocks at all at this level. */ |
2866 | 0 | page->last_struct->first_block = newblock; |
2867 | 0 | page->last_struct->last_block = newblock; |
2868 | 0 | newblock->id = page->last_struct->up->id; |
2869 | 0 | } |
2870 | 0 | else |
2871 | 0 | { |
2872 | | /* We have no blocks at ANY level. */ |
2873 | 0 | page->first_block = newblock; |
2874 | | /* newblock will have an id of 0. Best we can do. */ |
2875 | 0 | } |
2876 | | /* Wherever we linked it in, that's where we want to continue adding content. */ |
2877 | 0 | page->last_struct = newblock->u.s.down; |
2878 | 0 | page->last_block = NULL; |
2879 | 0 | } |
2880 | | |
2881 | | static void |
2882 | | fz_stext_end_structure(fz_context *ctx, fz_device *dev) |
2883 | 0 | { |
2884 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2885 | 0 | fz_stext_page *page = tdev->page; |
2886 | 0 | fz_stext_struct *str = page->last_struct; |
2887 | |
|
2888 | 0 | if (str == NULL) |
2889 | 0 | { |
2890 | 0 | fz_warn(ctx, "Structure out of sync"); |
2891 | 0 | return; |
2892 | 0 | } |
2893 | | |
2894 | 0 | page->last_struct = str->parent; |
2895 | 0 | if (page->last_struct == NULL) |
2896 | 0 | { |
2897 | 0 | page->last_block = page->first_block; |
2898 | | /* Yuck */ |
2899 | 0 | while (page->last_block->next) |
2900 | 0 | page->last_block = page->last_block->next; |
2901 | 0 | } |
2902 | 0 | else |
2903 | 0 | { |
2904 | 0 | page->last_block = page->last_struct->last_block; |
2905 | 0 | } |
2906 | 0 | } |
2907 | | |
2908 | | fz_device * |
2909 | | fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) |
2910 | 0 | { |
2911 | 0 | return fz_new_stext_device_for_page(ctx, page, opts, 0, 0, fz_empty_rect); |
2912 | 0 | } |
2913 | | |
2914 | | fz_device * |
2915 | | fz_new_stext_device_for_page(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts, int chapter_num, int page_num, fz_rect mediabox) |
2916 | 0 | { |
2917 | 0 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); |
2918 | |
|
2919 | 0 | dev->super.close_device = fz_stext_close_device; |
2920 | 0 | dev->super.drop_device = fz_stext_drop_device; |
2921 | |
|
2922 | 0 | dev->super.fill_text = fz_stext_fill_text; |
2923 | 0 | dev->super.stroke_text = fz_stext_stroke_text; |
2924 | 0 | dev->super.clip_text = fz_stext_clip_text; |
2925 | 0 | dev->super.clip_stroke_text = fz_stext_clip_stroke_text; |
2926 | 0 | dev->super.ignore_text = fz_stext_ignore_text; |
2927 | 0 | dev->super.begin_metatext = fz_stext_begin_metatext; |
2928 | 0 | dev->super.end_metatext = fz_stext_end_metatext; |
2929 | |
|
2930 | 0 | dev->super.fill_shade = fz_stext_fill_shade; |
2931 | 0 | dev->super.fill_image = fz_stext_fill_image; |
2932 | 0 | dev->super.fill_image_mask = fz_stext_fill_image_mask; |
2933 | |
|
2934 | 0 | if (opts) |
2935 | 0 | { |
2936 | 0 | dev->flags = opts->flags; |
2937 | 0 | if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE) |
2938 | 0 | { |
2939 | 0 | dev->super.begin_structure = fz_stext_begin_structure; |
2940 | 0 | dev->super.end_structure = fz_stext_end_structure; |
2941 | 0 | } |
2942 | 0 | if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES)) |
2943 | 0 | { |
2944 | 0 | dev->super.fill_path = fz_stext_fill_path; |
2945 | 0 | dev->super.stroke_path = fz_stext_stroke_path; |
2946 | 0 | } |
2947 | 0 | } |
2948 | 0 | dev->page = page; |
2949 | 0 | dev->pen.x = 0; |
2950 | 0 | dev->pen.y = 0; |
2951 | 0 | dev->trm = fz_identity; |
2952 | 0 | dev->lastchar = ' '; |
2953 | 0 | dev->lastline = NULL; |
2954 | 0 | dev->lasttext = NULL; |
2955 | 0 | dev->lastbidi = 0; |
2956 | 0 | dev->last_was_fake_bold = 1; |
2957 | 0 | if (opts) |
2958 | 0 | dev->opts = *opts; |
2959 | | |
2960 | | /* If we are ignoring images, then it'd be nice to skip the decode costs. BUT we still need them to tell |
2961 | | * us the bounds for ActualText, so we can only actually skip them if we are ignoring actualtext too. */ |
2962 | 0 | if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && (dev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0) |
2963 | 0 | dev->super.hints |= FZ_DONT_DECODE_IMAGES; |
2964 | |
|
2965 | 0 | dev->rect_max = 0; |
2966 | 0 | dev->rect_len = 0; |
2967 | 0 | dev->rects = NULL; |
2968 | | |
2969 | | /* Push a new id */ |
2970 | 0 | fz_try(ctx) |
2971 | 0 | { |
2972 | 0 | fz_stext_page_details *deets; |
2973 | 0 | size_t id; |
2974 | 0 | deets = fz_pool_array_append(ctx, page->id_list, &id); |
2975 | 0 | dev->id = (int)id; |
2976 | 0 | deets->mediabox = mediabox; |
2977 | 0 | deets->chapter = chapter_num; |
2978 | 0 | deets->page = page_num; |
2979 | 0 | } |
2980 | 0 | fz_catch(ctx) |
2981 | 0 | { |
2982 | 0 | fz_free(ctx, dev); |
2983 | 0 | fz_rethrow(ctx); |
2984 | 0 | } |
2985 | | |
2986 | 0 | page->mediabox = fz_union_rect(page->mediabox, mediabox); |
2987 | |
|
2988 | 0 | return (fz_device*)dev; |
2989 | 0 | } |