/src/mupdf/source/fitz/stext-device.c
Line | Count | Source |
1 | | // Copyright (C) 2004-2026 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // You should have received a copy of the GNU Affero General Public License |
15 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
16 | | // |
17 | | // Alternative licensing terms are available from the licensor. |
18 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
19 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
20 | | // CA 94129, USA, for further information. |
21 | | |
22 | | #include "mupdf/fitz.h" |
23 | | |
24 | | #include "glyphbox.h" |
25 | | |
26 | | #include "mupdf/ucdn.h" |
27 | | |
28 | | #include <float.h> |
29 | | #include <string.h> |
30 | | |
31 | | /* Simple layout structure */ |
32 | | |
33 | | fz_layout_block *fz_new_layout(fz_context *ctx) |
34 | 0 | { |
35 | 0 | fz_pool *pool = fz_new_pool(ctx); |
36 | 0 | fz_layout_block *block; |
37 | 0 | fz_try(ctx) |
38 | 0 | { |
39 | 0 | block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block)); |
40 | 0 | block->pool = pool; |
41 | 0 | block->head = NULL; |
42 | 0 | block->tailp = &block->head; |
43 | 0 | } |
44 | 0 | fz_catch(ctx) |
45 | 0 | { |
46 | 0 | fz_drop_pool(ctx, pool); |
47 | 0 | fz_rethrow(ctx); |
48 | 0 | } |
49 | 0 | return block; |
50 | 0 | } |
51 | | |
52 | | void fz_drop_layout(fz_context *ctx, fz_layout_block *block) |
53 | 0 | { |
54 | 0 | if (block) |
55 | 0 | fz_drop_pool(ctx, block->pool); |
56 | 0 | } |
57 | | |
58 | | void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p) |
59 | 0 | { |
60 | 0 | fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line)); |
61 | 0 | line->x = x; |
62 | 0 | line->y = y; |
63 | 0 | line->font_size = font_size; |
64 | 0 | line->p = p; |
65 | 0 | line->text = NULL; |
66 | 0 | line->next = NULL; |
67 | 0 | *block->tailp = line; |
68 | 0 | block->tailp = &line->next; |
69 | 0 | block->text_tailp = &line->text; |
70 | 0 | } |
71 | | |
72 | | void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p) |
73 | 0 | { |
74 | 0 | fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char)); |
75 | 0 | ch->x = x; |
76 | 0 | ch->advance = advance; |
77 | 0 | ch->p = p; |
78 | 0 | ch->next = NULL; |
79 | 0 | *block->text_tailp = ch; |
80 | 0 | block->text_tailp = &ch->next; |
81 | 0 | } |
82 | | |
83 | | /* Extract text into blocks and lines. */ |
84 | | |
/*
 * Distance thresholds used while assembling characters into lines.
 * Where these are used in this file the measured distance is first divided
 * by the font size, so the values are multiples of the font size.
 */
/* NOTE(review): not used in the part of the file reviewed here; presumably the line gap that starts a new paragraph - confirm. */
#define PARAGRAPH_DIST 1.5f
/* Tolerance along the baseline, e.g. when deciding whether RTL text is arriving in logical order. */
#define SPACE_DIST 0.15f
/* NOTE(review): not used in the part of the file reviewed here; presumably the widest gap still treated as a space - confirm. */
#define SPACE_MAX_DIST 0.8f
/* Maximum offset from the previous baseline for a glyph to stay on the same line. */
#define BASE_MAX_DIST 0.8f
/* Maximum distance between two identical glyphs for the second to count as a fake-bold overprint. */
#define FAKE_BOLD_MAX_DIST 0.1f
90 | | |
91 | | /* We keep a stack of the different metatexts that apply at any |
92 | | * given point (normally none!). Whenever we get some content |
93 | | * with a metatext in force, we really want to update the bounds |
94 | | * for that metatext. But running along the whole list each time |
95 | | * would be painful. So we just update the bounds for dev->metatext |
96 | | * and rely on metatext_bounds() propagating it upwards 'just in |
97 | | * time' for us to use metatexts other than the latest one. This |
98 | | * also means we need to propagate bounds upwards when we pop |
99 | | * a metatext. |
100 | | * |
101 | | * Why do we need bounds at all? Well, suppose we get: |
102 | | * /Span <</ActualText (c) >> BDC /Im0 Do EMC |
103 | | * Then where on the page do we put 'c' ? By collecting the |
104 | | * bounds, we can place 'c' wherever the image was. |
105 | | */ |
/* One entry in the device's stack of currently active metatexts. */
typedef struct metatext_t
{
	/* Kind of metatext; find_actualtext() looks for FZ_METATEXT_ACTUALTEXT. */
	fz_metatext type;
	/* Text attached to this metatext. NOTE(review): ownership is not visible here - confirm who frees it. */
	char *text;
	/* Bounds accumulated for this entry; metatext_bounds() merges newer entries' bounds into older ones. */
	fz_rect bounds;
	/* Next older entry on the stack, or NULL at the bottom. */
	struct metatext_t *prev;
} metatext_t;
113 | | |
/*
 * One entry in the device's list of 'rects' (see fz_stext_device.rects).
 * NOTE(review): the code that fills these in is outside the part of the
 * file reviewed here; the field notes below are inferred from the names
 * and need confirming.
 */
typedef struct
{
	/* Presumably the two end points of the line the rect represents. */
	fz_point from;
	fz_point to;
	/* Presumably the thickness of that line. */
	float thickness;
	/* Presumably the rectangle as drawn. */
	fz_rect rect;
	/* Presumably the packed colour the rect was drawn with. */
	int argb;
} rect_details;
122 | | |
/*
 * Device state for structured text extraction. 'super' is the first member,
 * so a pointer to this struct can be used where an fz_device is expected.
 */
typedef struct
{
	fz_device super;
	/* Page that extracted blocks, lines and chars are added to. */
	fz_stext_page *page;
	int id;
	/* pen: the position new glyph positions are measured against when judging spacing and baseline offset. */
	fz_point pen, start;
	// maybe_bullet: True if the 'start' position recorded was done so after either some actualtext
	// on an image, or after a glyph that's known to be used for bullets. This is used to stop us
	// spotting an 'indented' paragraph, because it's possibly just a bulleted list.
	int maybe_bullet;
	/* Position compared against new glyphs for the cheap fake-bold check and for RTL logical-order detection. */
	fz_point lag_pen;
	fz_matrix trm;
	/* Code point, line and bidi value of the most recently added character. */
	int lastchar;
	fz_stext_line *lastline;
	int lastbidi;
	/* Device flags; tested for FZ_STEXT_ACCURATE_BBOXES and passed to add_char_to_line(). */
	int flags;
	/* Colour passed to add_char_to_line() for each new character. */
	int color;
	/* Set when the previous glyph was swallowed as a fake-bold overprint. */
	int last_was_fake_bold;
	const fz_text *lasttext;
	/* Options; opts.flags is tested for FZ_STEXT_COLLECT_STYLES. */
	fz_stext_options opts;

	/* Top of the stack of active metatexts (linked through ->prev), or NULL. */
	metatext_t *metatext;

	/* Store the last values we saw. We need this for flushing the actualtext. */
	struct
	{
		int valid;
		int clipped;
		fz_matrix trm;
		int wmode;
		int bidi_level;
		fz_font *font;
		int flags;
	} last;

	/* The list of 'rects' seen during processing (if we're collecting styles). */
	int rect_max;
	int rect_len;
	rect_details *rects;

	/* Head and tail of the list of vector blocks held back until flush_lazy_vectors(). */
	fz_stext_block *lazy_vectors;
	fz_stext_block *lazy_vectors_tail;
} fz_stext_device;
166 | | |
/*
 * Human-readable help text listing the structured text option names, one
 * per line, each with a short description.
 */
const char *fz_stext_options_usage =
	"Structured text options:\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-spans: do not merge spans on the same line\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tparagraph-break: break blocks at paragraph boundaries\n"
	"\tdehyphenate: attempt to join up hyphenated words\n"
	"\tignore-actualtext: do not apply ActualText replacements\n"
	"\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
	"\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
	"\taccurate-bboxes: calculate char bboxes from the outlines\n"
	"\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
	"\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
	"\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
	"\tclip: do not include text that is completely clipped\n"
	"\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
	"\tstructured: collect structure markup\n"
	"\tvectors: include vector bboxes in output\n"
	"\tlazy-vectors: delay vectors that would otherwise split a text line\n"
	"\tfuzzy-vectors: merge abutting horizontal/vertical vectors\n"
	"\tsegment: attempt to segment the page\n"
	"\ttable-hunt: hunt for tables within a (segmented) page\n"
	"\tresolution: resolution to render at\n"
	"\n";
193 | | |
194 | | /* Find the current actualtext, if any. Will abort if dev == NULL. */ |
195 | | static metatext_t * |
196 | | find_actualtext(fz_stext_device *dev) |
197 | 0 | { |
198 | 0 | metatext_t *mt = dev->metatext; |
199 | |
|
200 | 0 | while (mt && mt->type != FZ_METATEXT_ACTUALTEXT) |
201 | 0 | mt = mt->prev; |
202 | |
|
203 | 0 | return mt; |
204 | 0 | } |
205 | | |
206 | | /* Find the bounds of the given metatext. Will abort if mt or |
207 | | * dev are NULL. */ |
208 | | static fz_rect * |
209 | | metatext_bounds(metatext_t *mt, fz_stext_device *dev) |
210 | 0 | { |
211 | 0 | metatext_t *mt2 = dev->metatext; |
212 | |
|
213 | 0 | while (mt2 != mt) |
214 | 0 | { |
215 | 0 | mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds); |
216 | 0 | mt2 = mt2->prev; |
217 | 0 | } |
218 | |
|
219 | 0 | return &mt->bounds; |
220 | 0 | } |
221 | | |
222 | | /* Find the bounds of the current actualtext, or NULL if there |
223 | | * isn't one. Will abort if dev is NULL. */ |
224 | | static fz_rect * |
225 | | actualtext_bounds(fz_stext_device *dev) |
226 | 0 | { |
227 | 0 | metatext_t *mt = find_actualtext(dev); |
228 | |
|
229 | 0 | if (mt == NULL) |
230 | 0 | return NULL; |
231 | | |
232 | 0 | return metatext_bounds(mt, dev); |
233 | 0 | } |
234 | | |
235 | | fz_stext_page * |
236 | | fz_new_stext_page(fz_context *ctx, fz_rect mediabox) |
237 | 0 | { |
238 | 0 | fz_pool *pool = fz_new_pool(ctx); |
239 | 0 | fz_stext_page *page = NULL; |
240 | 0 | fz_try(ctx) |
241 | 0 | { |
242 | 0 | page = fz_pool_alloc(ctx, pool, sizeof(*page)); |
243 | 0 | page->refs = 1; |
244 | 0 | page->pool = pool; |
245 | 0 | page->mediabox = mediabox; |
246 | 0 | page->first_block = NULL; |
247 | 0 | page->last_block = NULL; |
248 | 0 | page->id_list = fz_new_pool_array(ctx, pool, fz_stext_page_details, 4); |
249 | 0 | } |
250 | 0 | fz_catch(ctx) |
251 | 0 | { |
252 | 0 | fz_drop_pool(ctx, pool); |
253 | 0 | fz_rethrow(ctx); |
254 | 0 | } |
255 | 0 | return page; |
256 | 0 | } |
257 | | |
258 | | static void |
259 | | drop_run(fz_context *ctx, fz_stext_block *block) |
260 | 0 | { |
261 | 0 | fz_stext_line *line; |
262 | 0 | fz_stext_char *ch; |
263 | 0 | while (block) |
264 | 0 | { |
265 | 0 | switch (block->type) |
266 | 0 | { |
267 | 0 | case FZ_STEXT_BLOCK_IMAGE: |
268 | 0 | fz_drop_image(ctx, block->u.i.image); |
269 | 0 | break; |
270 | 0 | case FZ_STEXT_BLOCK_TEXT: |
271 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
272 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
273 | 0 | fz_drop_font(ctx, ch->font); |
274 | 0 | break; |
275 | 0 | case FZ_STEXT_BLOCK_STRUCT: |
276 | 0 | drop_run(ctx, block->u.s.down->first_block); |
277 | 0 | break; |
278 | 0 | default: |
279 | 0 | break; |
280 | 0 | } |
281 | 0 | block = block->next; |
282 | 0 | } |
283 | 0 | } |
284 | | |
285 | | fz_stext_page_details *fz_stext_page_details_for_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block) |
286 | 0 | { |
287 | 0 | if (block == NULL || page == NULL) |
288 | 0 | fz_throw(ctx, FZ_ERROR_ARGUMENT, "page details require a page and a block"); |
289 | | |
290 | 0 | return (fz_stext_page_details *)fz_pool_array_lookup(ctx, page->id_list, block->id); |
291 | 0 | } |
292 | | |
293 | | fz_stext_page * |
294 | | fz_keep_stext_page(fz_context *ctx, fz_stext_page *page) |
295 | 0 | { |
296 | 0 | return fz_keep_imp(ctx, page, &page->refs); |
297 | 0 | } |
298 | | |
299 | | void |
300 | | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) |
301 | 0 | { |
302 | 0 | if (page == NULL) |
303 | 0 | return; |
304 | | |
305 | 0 | if (fz_drop_imp(ctx, page, &page->refs)) |
306 | 0 | { |
307 | 0 | drop_run(ctx, page->first_block); |
308 | 0 | fz_drop_pool(ctx, page->pool); |
309 | 0 | } |
310 | 0 | } |
311 | | |
312 | | /* |
313 | | * This adds a new block at the end of the page. This should not be used |
314 | | * to add 'struct' blocks to the page as those have to be added internally, |
315 | | * with more complicated pointer setup. |
316 | | */ |
317 | | static fz_stext_block * |
318 | | add_block_to_page(fz_context *ctx, fz_stext_page *page, int type, int id) |
319 | 0 | { |
320 | 0 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
321 | 0 | block->bbox = fz_empty_rect; /* Fixes bug 703267. */ |
322 | 0 | block->prev = page->last_block; |
323 | 0 | block->type = type; |
324 | 0 | block->id = id; |
325 | 0 | if (page->last_struct) |
326 | 0 | { |
327 | 0 | if (page->last_struct->last_block) |
328 | 0 | { |
329 | 0 | block->prev = page->last_struct->last_block; |
330 | 0 | block->prev->next = block; |
331 | 0 | page->last_struct->last_block = block; |
332 | 0 | } |
333 | 0 | else |
334 | 0 | page->last_struct->last_block = page->last_struct->first_block = block; |
335 | 0 | } |
336 | 0 | else if (!page->last_block) |
337 | 0 | { |
338 | 0 | assert(!page->first_block); |
339 | 0 | page->first_block = page->last_block = block; |
340 | 0 | } |
341 | 0 | else |
342 | 0 | { |
343 | 0 | page->last_block->next = block; |
344 | 0 | page->last_block = block; |
345 | 0 | } |
346 | 0 | return block; |
347 | 0 | } |
348 | | |
349 | | static fz_stext_block * |
350 | | add_lazy_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, int id) |
351 | 0 | { |
352 | 0 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
353 | 0 | block->bbox = fz_empty_rect; |
354 | 0 | block->prev = tdev->lazy_vectors_tail; |
355 | 0 | block->type = FZ_STEXT_BLOCK_VECTOR; |
356 | 0 | block->id = id; |
357 | |
|
358 | 0 | if (tdev->lazy_vectors == NULL) |
359 | 0 | tdev->lazy_vectors = block; |
360 | 0 | else |
361 | 0 | tdev->lazy_vectors_tail->next = block; |
362 | 0 | tdev->lazy_vectors_tail = block; |
363 | |
|
364 | 0 | return block; |
365 | 0 | } |
366 | | |
367 | | static void |
368 | | flush_lazy_vectors(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev) |
369 | 0 | { |
370 | 0 | if (tdev->lazy_vectors == NULL) |
371 | 0 | return; |
372 | | |
373 | 0 | if (page->last_struct) |
374 | 0 | { |
375 | 0 | if (page->last_struct->last_block) |
376 | 0 | { |
377 | 0 | page->last_struct->last_block->next = tdev->lazy_vectors; |
378 | 0 | tdev->lazy_vectors->prev = page->last_struct->last_block; |
379 | 0 | page->last_struct->last_block = tdev->lazy_vectors_tail; |
380 | 0 | } |
381 | 0 | else |
382 | 0 | { |
383 | 0 | page->last_struct->first_block = tdev->lazy_vectors; |
384 | 0 | page->last_struct->last_block = tdev->lazy_vectors_tail; |
385 | 0 | } |
386 | 0 | } |
387 | 0 | else if (!page->last_block) |
388 | 0 | { |
389 | 0 | page->first_block = tdev->lazy_vectors; |
390 | 0 | page->last_block = tdev->lazy_vectors_tail; |
391 | 0 | } |
392 | 0 | else |
393 | 0 | { |
394 | 0 | page->last_block->next = tdev->lazy_vectors; |
395 | 0 | tdev->lazy_vectors->prev = page->last_block; |
396 | 0 | page->last_block = tdev->lazy_vectors_tail; |
397 | 0 | } |
398 | |
|
399 | 0 | tdev->lazy_vectors = tdev->lazy_vectors_tail = NULL; |
400 | 0 | } |
401 | | |
402 | | static fz_stext_block * |
403 | | add_text_block_to_page(fz_context *ctx, fz_stext_page *page, int id) |
404 | 0 | { |
405 | 0 | return add_block_to_page(ctx, page, FZ_STEXT_BLOCK_TEXT, id); |
406 | 0 | } |
407 | | |
408 | | static fz_stext_block * |
409 | | add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image, int id) |
410 | 0 | { |
411 | 0 | fz_stext_block *block = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_IMAGE, id); |
412 | 0 | block->u.i.transform = ctm; |
413 | 0 | block->u.i.image = fz_keep_image(ctx, image); |
414 | 0 | block->bbox = fz_transform_rect(fz_unit_rect, ctm); |
415 | 0 | return block; |
416 | 0 | } |
417 | | |
418 | | static fz_stext_line * |
419 | | add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi) |
420 | 0 | { |
421 | 0 | fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); |
422 | 0 | line->prev = block->u.t.last_line; |
423 | 0 | if (!block->u.t.first_line) |
424 | 0 | block->u.t.first_line = block->u.t.last_line = line; |
425 | 0 | else |
426 | 0 | { |
427 | 0 | block->u.t.last_line->next = line; |
428 | 0 | block->u.t.last_line = line; |
429 | 0 | } |
430 | |
|
431 | 0 | line->dir = *dir; |
432 | 0 | line->wmode = wmode; |
433 | |
|
434 | 0 | return line; |
435 | 0 | } |
436 | | |
/*
 * Sentinel values for the 'glyph' argument of add_char_to_line(), used in
 * place of a real glyph id.
 */
/* An added space in accurate mode: the char gets zero ascender and descender. */
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
/* Non-accurate mode: use the font's ascender/descender instead of glyph bounds. */
#define NON_ACCURATE_GLYPH (-1)
439 | | |
/*
 * Allocate a character from the page's pool, append it to 'line', and fill
 * it in.
 *
 * trm: matrix used to transform the ascender/descender vectors.
 * font: font for the char; the char keeps its own reference.
 * c: code point stored in the char.
 * glyph: a glyph id, or NON_ACCURATE_GLYPH / NON_ACCURATE_GLYPH_ADDED_SPACE
 *	to choose how the vertical extent is obtained (see below).
 * p, q: start and end points of the char; p is also stored as its origin.
 * synthetic: non-zero sets FZ_STEXT_SYNTHETIC; a value greater than 1 also
 *	sets FZ_STEXT_SYNTHETIC_LARGE.
 * flags: extra char flags, ORed in as given.
 * dev_flags: device flags; only FZ_STEXT_ACCURATE_SIDE_BEARINGS is tested.
 *
 * Returns the new char.
 */
static fz_stext_char *
add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
{
	fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
	/* a = offset to the top edge of the quad, d = offset to the bottom edge. */
	fz_point a, d;

	/* Append to the line's char list. */
	if (!line->first_char)
		line->first_char = line->last_char = ch;
	else
	{
		line->last_char->next = ch;
		line->last_char = ch;
	}

	ch->c = c;
	ch->argb = color;
	ch->bidi = bidi;
	ch->origin = *p;
	ch->size = size;
	ch->font = fz_keep_font(ctx, font);
	ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0) | (synthetic > 1 ? FZ_STEXT_SYNTHETIC_LARGE : 0);
	if (font->flags.is_bold)
		ch->flags |= FZ_STEXT_BOLD;

	if (line->wmode == 0)
	{
		fz_rect bounds;
		int bounded = 0;
		a.x = 0;
		d.x = 0;
		if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
		{
			/* Added space, in accurate mode. */
			a.y = d.y = 0;
		}
		else if (glyph == NON_ACCURATE_GLYPH)
		{
			/* Non accurate mode. */
			a.y = fz_font_ascender(ctx, font);
			d.y = fz_font_descender(ctx, font);
		}
		else
		{
			/* Any glyph in accurate mode */
			bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
			bounded = 1;
			a.y = bounds.y1;
			d.y = bounds.y0;
		}
		if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
		{
			/* NOTE(review): when glyph is one of the negative sentinels, that
			 * sentinel is passed to fz_bound_glyph() here - confirm this is
			 * intended. */
			if (!bounded)
				bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
			if (a.x > bounds.x0)
				a.x = bounds.x0;
			/* NOTE(review): this compares and assigns the vertical component
			 * d.y against the horizontal extent bounds.x1, which looks like a
			 * mix-up of axes. The intended adjustment is not clear from this
			 * function, so the code is left unchanged - confirm. */
			if (d.y < bounds.x1)
				d.y = bounds.x1;
		}
	}
	else
	{
		/* Vertical writing mode: fixed offsets, no font metrics used. */
		a.x = 1;
		d.x = 0;
		a.y = 0;
		d.y = 0;
	}
	a = fz_transform_vector(a, trm);
	d = fz_transform_vector(d, trm);

	/* Left edge is built from p, right edge from q; d gives the lower
	 * corners and a the upper corners. */
	ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
	ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
	ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
	ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);

	return ch;
}
516 | | |
517 | | static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail) |
518 | 0 | { |
519 | 0 | fz_stext_char *prev, *next; |
520 | 0 | prev = tail; |
521 | 0 | while (curr != tail) |
522 | 0 | { |
523 | 0 | next = curr->next; |
524 | 0 | curr->next = prev; |
525 | 0 | prev = curr; |
526 | 0 | curr = next; |
527 | 0 | } |
528 | 0 | return prev; |
529 | 0 | } |
530 | | |
531 | | static void reverse_bidi_line(fz_stext_line *line) |
532 | 0 | { |
533 | 0 | fz_stext_char *a, *b, **prev; |
534 | 0 | prev = &line->first_char; |
535 | 0 | for (a = line->first_char; a; a = a->next) |
536 | 0 | { |
537 | 0 | if (a->bidi) |
538 | 0 | { |
539 | 0 | b = a; |
540 | 0 | while (b->next && b->next->bidi) |
541 | 0 | b = b->next; |
542 | 0 | if (a != b) |
543 | 0 | *prev = reverse_bidi_span(a, b->next); |
544 | 0 | } |
545 | 0 | prev = &a->next; |
546 | 0 | line->last_char = a; |
547 | 0 | } |
548 | 0 | } |
549 | | |
/*
 * Return 1 if c is one of the hyphen characters recognised here, else 0.
 */
int fz_is_unicode_hyphen(int c)
{
	switch (c)
	{
	case '-':	/* hyphen-minus */
	case 0xAD:	/* soft hyphen */
	case 0x2010:	/* hyphen */
	case 0x2011:	/* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
555 | | |
556 | | static float |
557 | | vec_dot(const fz_point *a, const fz_point *b) |
558 | 0 | { |
559 | 0 | return a->x * b->x + a->y * b->y; |
560 | 0 | } |
561 | | |
/*
 * Return 1 if a space may be added after 'lastchar', else 0.
 *
 * A space is never added after a space. Otherwise it is allowed for code
 * points below 0x700 (basic latin, greek, cyrillic, hebrew, arabic) and for
 * 0x2000..0x20CF (general punctuation, superscripts and subscripts, and
 * currency symbols).
 */
static int may_add_space(int lastchar)
{
	if (lastchar == ' ')
		return 0;

	if (lastchar < 0x700)
		return 1;

	return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
571 | | |
572 | 0 | #define FAKEBOLD_THRESHOLD_RECIP (1.0f / FAKE_BOLD_MAX_DIST) |
573 | | |
574 | | static int |
575 | | is_within_fake_bold_distance(float a, float b, float size) |
576 | 0 | { |
577 | 0 | a -= b; |
578 | 0 | if (a < 0) |
579 | 0 | a = -a; |
580 | |
|
581 | 0 | return FAKEBOLD_THRESHOLD_RECIP * a < size; |
582 | 0 | } |
583 | | |
584 | | static int |
585 | | font_equiv(fz_context *ctx, fz_font *f, fz_font *g) |
586 | 0 | { |
587 | 0 | unsigned char fdigest[16]; |
588 | 0 | unsigned char gdigest[16]; |
589 | |
|
590 | 0 | if (f == g) |
591 | 0 | return 1; |
592 | | |
593 | 0 | if (strcmp(f->name, g->name) != 0) |
594 | 0 | return 0; |
595 | | |
596 | 0 | if (f->buffer == NULL || g->buffer == NULL) |
597 | 0 | return 0; |
598 | | |
599 | 0 | fz_font_digest(ctx, f, fdigest); |
600 | 0 | fz_font_digest(ctx, g, gdigest); |
601 | |
|
602 | 0 | return (memcmp(fdigest, gdigest, 16) == 0); |
603 | 0 | } |
604 | | |
605 | | static int |
606 | | check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags) |
607 | 0 | { |
608 | 0 | fz_stext_line *line; |
609 | 0 | fz_stext_char *ch; |
610 | |
|
611 | 0 | for (; block != NULL; block = block->next) |
612 | 0 | { |
613 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
614 | 0 | { |
615 | 0 | if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags)) |
616 | 0 | return 1; |
617 | 0 | } |
618 | 0 | else if (block->type == FZ_STEXT_BLOCK_TEXT) |
619 | 0 | { |
620 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
621 | 0 | { |
622 | 0 | fz_stext_char *pr = NULL; |
623 | 0 | for (ch = line->first_char; ch != NULL; ch = ch->next) |
624 | 0 | { |
625 | | /* Not perfect, but it'll do! */ |
626 | 0 | if (ch->c == c && is_within_fake_bold_distance(ch->origin.x, p.x, size) && is_within_fake_bold_distance(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font)) |
627 | 0 | { |
628 | | /* If we were filled before, and we are stroking now... */ |
629 | 0 | if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED && |
630 | 0 | (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED) |
631 | 0 | { |
632 | | /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */ |
633 | 0 | ch->flags |= flags; |
634 | 0 | return 1; |
635 | 0 | } |
636 | | /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these |
637 | | * as boldening if either the char before, or the char after were also boldened. */ |
638 | 0 | ch->flags |= flags; |
639 | |
|
640 | 0 | if (c == ' ') |
641 | 0 | { |
642 | 0 | if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) || |
643 | 0 | (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0)) |
644 | 0 | { |
645 | | /* OK, we can be bold. */ |
646 | 0 | ch->flags |= FZ_STEXT_BOLD; |
647 | 0 | } |
648 | | /* Whether we have recorded this as being bold or not, still |
649 | | * claim we did, so we swallow the space and don't reemit it. */ |
650 | 0 | return 1; |
651 | 0 | } |
652 | 0 | else |
653 | 0 | { |
654 | 0 | ch->flags |= FZ_STEXT_BOLD; |
655 | 0 | return 1; |
656 | 0 | } |
657 | 0 | } |
658 | 0 | pr = ch; |
659 | 0 | } |
660 | 0 | } |
661 | 0 | } |
662 | 0 | } |
663 | | |
664 | 0 | return 0; |
665 | 0 | } |
666 | | |
/*
 * Return 1 if c is a character that could plausibly be a list bullet or
 * list marker (including the ASCII digits), else 0.
 */
static int
plausible_bullet(int c)
{
	/* Digits. */
	if (c >= '0' && c <= '9')
		return 1;

	/* 0x261A..0x261F: black/white pointing index symbols. */
	if (c >= 0x261A && c <= 0x261F)
		return 1;

	/* 0x2660..0x2667: black and white card suits (spade, heart, diamond, club). */
	if (c >= 0x2660 && c <= 0x2667)
		return 1;

	/* 0x1F446..0x1F449: white up/down/left/right pointing backhand index. */
	if (c >= 0x1F446 && c <= 0x1F449)
		return 1;

	/* 0x1F597..0x1F5A3: further pointing index and backhand index symbols. */
	if (c >= 0x1F597 && c <= 0x1F5A3)
		return 1;

	/* 0x1FBC1..0x1FBC3: left/middle/right third white right pointing index. */
	if (c >= 0x1FBC1 && c <= 0x1FBC3)
		return 1;

	switch (c)
	{
	case '*':
	case 0x00B7:	/* Middle Dot */
	case 0x2022:	/* Bullet */
	case 0x2023:	/* Triangular Bullet */
	case 0x2043:	/* Hyphen Bullet */
	case 0x204C:	/* Black leftwards bullet */
	case 0x204D:	/* Black rightwards bullet */
	case 0x2219:	/* Bullet operator */
	case 0x25C9:	/* Fisheye */
	case 0x25CB:	/* White circle */
	case 0x25CF:	/* Black circle */
	case 0x25D8:	/* Inverse Bullet */
	case 0x25E6:	/* White Bullet */
	case 0x2619:	/* Reversed Rotated Floral Heart Bullet / Fleuron */
	case 0x2765:	/* Rotated Heavy Black Heart Bullet */
	case 0x2767:	/* Rotated Floral Heart Bullet / Fleuron */
	case 0x29BE:	/* Circled White Bullet */
	case 0x29BF:	/* Circled Bullet */
	case 0xFFFD:	/* UNICODE_REPLACEMENT_CHARACTER */
		return 1;
	default:
		return 0;
	}
}
726 | | |
727 | | static void |
728 | | fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags) |
729 | 0 | { |
730 | 0 | fz_stext_page *page = dev->page; |
731 | 0 | fz_stext_block *cur_block; |
732 | 0 | fz_stext_line *cur_line = NULL; |
733 | |
|
734 | 0 | int new_para = 0; |
735 | 0 | int new_line = 1; |
736 | 0 | int add_space = 0; |
737 | 0 | fz_point dir, ndir, p, q; |
738 | 0 | float size; |
739 | 0 | fz_point delta; |
740 | 0 | float spacing = 0; |
741 | 0 | float base_offset = 0; |
742 | 0 | float dist; |
743 | | |
744 | | /* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */ |
745 | 0 | bidi = bidi & 1; |
746 | | |
747 | | /* dir = direction vector for motion. ndir = normalised(dir) */ |
748 | 0 | if (wmode == 0) |
749 | 0 | { |
750 | 0 | dir.x = 1; |
751 | 0 | dir.y = 0; |
752 | 0 | } |
753 | 0 | else |
754 | 0 | { |
755 | 0 | dir.x = 0; |
756 | 0 | dir.y = -1; |
757 | 0 | } |
758 | 0 | dir = fz_transform_vector(dir, trm); |
759 | 0 | ndir = fz_normalize_vector(dir); |
760 | |
|
761 | 0 | size = fz_matrix_expansion(trm); |
762 | | |
763 | | /* We need to identify where glyphs 'start' (p) and 'stop' (q). |
764 | | * Each glyph holds its 'start' position, and the next glyph in the |
765 | | * span (or span->max if there is no next glyph) holds its 'end' |
766 | | * position. |
767 | | * |
768 | | * For both horizontal and vertical motion, trm->{e,f} gives the |
769 | | * origin (usually the bottom left) of the glyph. |
770 | | * |
771 | | * In horizontal mode: |
772 | | * + p is bottom left. |
773 | | * + q is the bottom right |
774 | | * In vertical mode: |
775 | | * + p is top left (where it advanced from) |
776 | | * + q is bottom left |
777 | | */ |
778 | 0 | if (wmode == 0) |
779 | 0 | { |
780 | 0 | p.x = trm.e; |
781 | 0 | p.y = trm.f; |
782 | 0 | q.x = trm.e + adv * dir.x; |
783 | 0 | q.y = trm.f + adv * dir.y; |
784 | 0 | } |
785 | 0 | else |
786 | 0 | { |
787 | 0 | p.x = trm.e - adv * dir.x; |
788 | 0 | p.y = trm.f - adv * dir.y; |
789 | 0 | q.x = trm.e; |
790 | 0 | q.y = trm.f; |
791 | 0 | } |
792 | | |
793 | | //printf("%g,%g \"%c\" %g,%g\n", p.x, p.y, c, q.x, q.y); |
794 | |
|
795 | 0 | if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0) |
796 | 0 | { |
797 | 0 | if (glyph < 0) |
798 | 0 | { |
799 | 0 | if (dev->last_was_fake_bold) |
800 | 0 | return; |
801 | 0 | } |
802 | 0 | else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags)) |
803 | 0 | { |
804 | 0 | dev->last_was_fake_bold = 1; |
805 | 0 | return; |
806 | 0 | } |
807 | 0 | dev->last_was_fake_bold = 0; |
808 | 0 | } |
809 | | |
810 | | /* Find current position to enter new text. */ |
811 | 0 | cur_block = page->last_struct ? page->last_struct->last_block : page->last_block; |
812 | 0 | if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) |
813 | 0 | cur_block = NULL; |
814 | 0 | cur_line = cur_block ? cur_block->u.t.last_line : NULL; |
815 | | |
816 | | /* We use glyph == -2 to indicate the first no-glyph char from an actualtext. The position |
817 | | * is valid though, so we want to advance the pen for these. */ |
818 | | |
819 | | /* Don't advance pen or break lines for either no-glyph or marking non-spacing characters in a cluster */ |
820 | 0 | if (cur_line && (glyph == -1 || ucdn_get_general_category(c) == UCDN_GENERAL_CATEGORY_MN)) |
821 | 0 | { |
822 | 0 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags); |
823 | 0 | dev->lastbidi = bidi; |
824 | 0 | dev->lastchar = c; |
825 | 0 | dev->lastline = cur_line; |
826 | 0 | return; |
827 | 0 | } |
828 | | |
829 | 0 | if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f) |
830 | 0 | { |
831 | | /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), |
832 | | * then we can't append to the current block/line. */ |
833 | 0 | new_para = 1; |
834 | 0 | new_line = 1; |
835 | 0 | } |
836 | 0 | else |
837 | 0 | { |
838 | | /* Detect fake bold where text is printed twice in the same place. */ |
839 | | /* Largely supplanted by the check_for_fake_bold mechanism above, |
840 | | * but we leave this in for backward compatibility as it's cheap, |
841 | | * and works even when FZ_STEXT_COLLECT_STYLES is not set. */ |
842 | 0 | dist = hypotf(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y) / size; |
843 | | /* This can trigger improperly for glyphs that come from actualtext |
844 | | * as they are frequently overlaid. Therefore rely on glyph >= 0. */ |
845 | 0 | if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar && glyph >= 0) |
846 | 0 | return; |
847 | | |
848 | | /* Calculate how far we've moved since the last character. */ |
849 | 0 | delta.x = p.x - dev->pen.x; |
850 | 0 | delta.y = p.y - dev->pen.y; |
851 | | |
852 | | /* The transform has not changed, so we know we're in the same |
853 | | * direction. Calculate 2 distances; how far off the previous |
854 | | * baseline we are, together with how far along the baseline |
855 | | * we are from the expected position. */ |
856 | 0 | spacing = (ndir.x * delta.x + ndir.y * delta.y) / size; |
857 | 0 | base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size; |
858 | | |
859 | | /* Only a small amount off the baseline - we'll take this */ |
860 | 0 | if (fabsf(base_offset) < BASE_MAX_DIST) |
861 | 0 | { |
862 | | /* If mixed LTR and RTL content */ |
863 | 0 | if ((bidi & 1) != (dev->lastbidi & 1)) |
864 | 0 | { |
865 | | /* Ignore jumps within line when switching between LTR and RTL text. */ |
866 | 0 | new_line = 0; |
867 | 0 | } |
868 | | |
869 | | /* RTL */ |
870 | 0 | else if (bidi & 1) |
871 | 0 | { |
872 | 0 | fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y); |
873 | 0 | float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv; |
874 | | |
875 | | /* If the pen is where we would have been if we |
876 | | * had advanced backwards from the previous |
877 | | * character by this character's advance, we |
878 | | * are probably seeing characters emitted in |
879 | | * logical order. |
880 | | */ |
881 | 0 | if (fabsf(logical_spacing) < SPACE_DIST) |
882 | 0 | { |
883 | 0 | new_line = 0; |
884 | 0 | } |
885 | | |
886 | | /* However, if the pen has advanced to where we would expect it |
887 | | * in an LTR context, we're seeing them emitted in visual order |
888 | | * and should flag them for reordering! |
889 | | */ |
890 | 0 | else if (fabsf(spacing) < SPACE_DIST) |
891 | 0 | { |
892 | 0 | bidi = 3; /* mark line as visual */ |
893 | 0 | new_line = 0; |
894 | 0 | } |
895 | | |
896 | | /* And any other small jump could be a missing space. */ |
897 | 0 | else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST) |
898 | 0 | { |
899 | 0 | if (wmode == 0 && may_add_space(dev->lastchar)) |
900 | 0 | add_space = 1; |
901 | 0 | new_line = 0; |
902 | 0 | } |
903 | 0 | else if (spacing < 0 && spacing > -SPACE_MAX_DIST) |
904 | 0 | { |
905 | | /* Motion is in line, but negative. We've probably got overlapping |
906 | | * chars here. Live with it. */ |
907 | 0 | new_line = 0; |
908 | 0 | } |
909 | 0 | else if (spacing > 0 && spacing < SPACE_MAX_DIST) |
910 | 0 | { |
911 | 0 | bidi = 3; /* mark line as visual */ |
912 | 0 | if (wmode == 0 && may_add_space(dev->lastchar)) |
913 | 0 | add_space = 1 + (spacing > SPACE_DIST*2); |
914 | 0 | new_line = 0; |
915 | 0 | } |
916 | | |
917 | 0 | else |
918 | 0 | { |
919 | | /* Motion is large and unexpected (probably a new table column). */ |
920 | 0 | new_line = 1; |
921 | 0 | } |
922 | 0 | } |
923 | | |
924 | | /* LTR or neutral character */ |
925 | 0 | else |
926 | 0 | { |
927 | 0 | if (fabsf(spacing) < SPACE_DIST) |
928 | 0 | { |
929 | | /* Motion is in line and small enough to ignore. */ |
930 | 0 | new_line = 0; |
931 | 0 | } |
932 | 0 | else if (spacing < 0 && spacing > -SPACE_MAX_DIST) |
933 | 0 | { |
934 | | /* Motion is in line, but negative. We've probably got overlapping |
935 | | * chars here. Live with it. */ |
936 | 0 | new_line = 0; |
937 | 0 | } |
938 | 0 | else if (spacing > 0 && spacing < SPACE_MAX_DIST) |
939 | 0 | { |
940 | | /* Motion is forward in line and large enough to warrant us adding a space. */ |
941 | 0 | if (wmode == 0 && may_add_space(dev->lastchar)) |
942 | 0 | add_space = 1 + (spacing > SPACE_DIST*2); |
943 | 0 | new_line = 0; |
944 | 0 | } |
945 | 0 | else |
946 | 0 | { |
947 | | /* Motion is large and unexpected (probably a new table column). */ |
948 | 0 | new_line = 1; |
949 | 0 | } |
950 | 0 | } |
951 | 0 | } |
952 | | |
953 | | /* Enough for a new line, but not enough for a new paragraph */ |
954 | 0 | else if (fabsf(base_offset) <= PARAGRAPH_DIST) |
955 | 0 | { |
956 | | /* Check indent to spot text-indent style paragraphs */ |
957 | 0 | if (wmode == 0 && cur_line) |
958 | 0 | if ((p.x - dev->start.x) > 0.5f && !dev->maybe_bullet) |
959 | 0 | new_para = 1; |
960 | 0 | new_line = 1; |
961 | 0 | } |
962 | | |
963 | | /* Way off the baseline - open a new paragraph */ |
964 | 0 | else |
965 | 0 | { |
966 | 0 | new_para = 1; |
967 | 0 | new_line = 1; |
968 | 0 | } |
969 | 0 | } |
970 | | |
971 | | /* Start a new block (but only at the beginning of a text object) */ |
972 | 0 | if (new_para || !cur_block) |
973 | 0 | { |
974 | 0 | flush_lazy_vectors(ctx, page, dev); |
975 | 0 | cur_block = add_text_block_to_page(ctx, page, dev->id); |
976 | 0 | cur_line = cur_block->u.t.last_line; |
977 | 0 | } |
978 | |
|
979 | 0 | if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(dev->lastchar) && dev->lastline != NULL) |
980 | 0 | dev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED; |
981 | | |
982 | | /* Start a new line */ |
983 | 0 | if (new_line || !cur_line || force_new_line) |
984 | 0 | { |
985 | 0 | cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi); |
986 | 0 | dev->start = p; |
987 | 0 | if (glyph == -2) |
988 | 0 | dev->maybe_bullet = 1; |
989 | 0 | else |
990 | 0 | dev->maybe_bullet = plausible_bullet(c); |
991 | 0 | } |
992 | | |
993 | | /* Henceforth treat such non-glyphs in the usual way. */ |
994 | 0 | if (glyph == -2) |
995 | 0 | glyph = -1; |
996 | | |
997 | | /* Add synthetic space */ |
998 | 0 | if (c != ' ' && add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES)) |
999 | 0 | add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, add_space, flags, dev->flags); |
1000 | |
|
1001 | 0 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags); |
1002 | |
|
1003 | 0 | dev->lastchar = c; |
1004 | 0 | dev->lastbidi = bidi; |
1005 | 0 | dev->lastline = cur_line; |
1006 | 0 | dev->lag_pen = p; |
1007 | 0 | dev->pen = q; |
1008 | |
|
1009 | 0 | dev->trm = trm; |
1010 | 0 | } |
1011 | | |
1012 | | static void |
1013 | | fz_add_stext_char(fz_context *ctx, |
1014 | | fz_stext_device *dev, |
1015 | | fz_font *font, |
1016 | | int c, |
1017 | | int glyph, |
1018 | | fz_matrix trm, |
1019 | | float adv, |
1020 | | int wmode, |
1021 | | int bidi, |
1022 | | int force_new_line, |
1023 | | int flags) |
1024 | 0 | { |
1025 | | /* ignore when one unicode character maps to multiple glyphs */ |
1026 | 0 | if (c == -1) |
1027 | 0 | return; |
1028 | | |
1029 | 0 | if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS) |
1030 | 0 | fz_calculate_font_ascender_descender(ctx, font); |
1031 | |
|
1032 | 0 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) |
1033 | 0 | { |
1034 | 0 | switch (c) |
1035 | 0 | { |
1036 | 0 | case 0xFB00: /* ff */ |
1037 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1038 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); |
1039 | 0 | return; |
1040 | 0 | case 0xFB01: /* fi */ |
1041 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1042 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags); |
1043 | 0 | return; |
1044 | 0 | case 0xFB02: /* fl */ |
1045 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1046 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags); |
1047 | 0 | return; |
1048 | 0 | case 0xFB03: /* ffi */ |
1049 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1050 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); |
1051 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags); |
1052 | 0 | return; |
1053 | 0 | case 0xFB04: /* ffl */ |
1054 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1055 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); |
1056 | 0 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags); |
1057 | 0 | return; |
1058 | 0 | case 0xFB05: /* long st */ |
1059 | 0 | case 0xFB06: /* st */ |
1060 | 0 | fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1061 | 0 | fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags); |
1062 | 0 | return; |
1063 | 0 | } |
1064 | | |
1065 | | /* alphabetic and arabic presentation forms */ |
1066 | 0 | if ((c >= 0xfb00 && c <= 0xfdff) || (c >= 0xfe70 && c <= 0xfefc)) |
1067 | 0 | { |
1068 | 0 | uint32_t lig[18]; |
1069 | 0 | int i, n = ucdn_compat_decompose(c, lig); |
1070 | 0 | fz_add_stext_char_imp(ctx, dev, font, lig[0], glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1071 | 0 | for (i = 1; i < n; ++i) |
1072 | 0 | fz_add_stext_char_imp(ctx, dev, font, lig[i], -1, trm, 0, wmode, bidi, 0, flags); |
1073 | 0 | return; |
1074 | 0 | } |
1075 | 0 | } |
1076 | | |
1077 | 0 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) |
1078 | 0 | { |
1079 | 0 | switch (c) |
1080 | 0 | { |
1081 | 0 | case 0x0009: /* tab */ |
1082 | 0 | case 0x0020: /* space */ |
1083 | 0 | case 0x00A0: /* no-break space */ |
1084 | 0 | case 0x1680: /* ogham space mark */ |
1085 | 0 | case 0x180E: /* mongolian vowel separator */ |
1086 | 0 | case 0x2000: /* en quad */ |
1087 | 0 | case 0x2001: /* em quad */ |
1088 | 0 | case 0x2002: /* en space */ |
1089 | 0 | case 0x2003: /* em space */ |
1090 | 0 | case 0x2004: /* three-per-em space */ |
1091 | 0 | case 0x2005: /* four-per-em space */ |
1092 | 0 | case 0x2006: /* six-per-em space */ |
1093 | 0 | case 0x2007: /* figure space */ |
1094 | 0 | case 0x2008: /* punctuation space */ |
1095 | 0 | case 0x2009: /* thin space */ |
1096 | 0 | case 0x200A: /* hair space */ |
1097 | 0 | case 0x202F: /* narrow no-break space */ |
1098 | 0 | case 0x205F: /* medium mathematical space */ |
1099 | 0 | case 0x3000: /* ideographic space */ |
1100 | 0 | c = ' '; |
1101 | 0 | } |
1102 | 0 | } |
1103 | |
|
1104 | 0 | fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags); |
1105 | 0 | } |
1106 | | |
1107 | | static fz_rect |
1108 | | current_clip(fz_context *ctx, fz_stext_device *dev) |
1109 | 0 | { |
1110 | 0 | fz_rect r = fz_infinite_rect; |
1111 | |
|
1112 | 0 | if (dev->flags & FZ_STEXT_CLIP) |
1113 | 0 | { |
1114 | 0 | r = fz_device_current_scissor(ctx, &dev->super); |
1115 | 0 | r = fz_intersect_rect(r, dev->page->mediabox); |
1116 | 0 | } |
1117 | 0 | if (dev->flags & FZ_STEXT_CLIP_RECT) |
1118 | 0 | r = fz_intersect_rect(r, dev->opts.clip); |
1119 | |
|
1120 | 0 | return r; |
1121 | 0 | } |
1122 | | |
1123 | | static void |
1124 | | do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags) |
1125 | 0 | { |
1126 | 0 | fz_font *font = span->font; |
1127 | 0 | fz_matrix tm = span->trm; |
1128 | 0 | float adv; |
1129 | 0 | int unicode; |
1130 | 0 | int i; |
1131 | |
|
1132 | 0 | for (i = start; i < end; i++) |
1133 | 0 | { |
1134 | 0 | if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) |
1135 | 0 | { |
1136 | 0 | fz_rect r = current_clip(ctx, dev); |
1137 | 0 | if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r)) |
1138 | 0 | { |
1139 | 0 | dev->last.clipped = 1; |
1140 | 0 | continue; |
1141 | 0 | } |
1142 | 0 | } |
1143 | 0 | dev->last.clipped = 0; |
1144 | | |
1145 | | /* Calculate new pen location and delta */ |
1146 | 0 | tm.e = span->items[i].x; |
1147 | 0 | tm.f = span->items[i].y; |
1148 | 0 | dev->last.trm = fz_concat(tm, ctm); |
1149 | 0 | dev->last.bidi_level = span->bidi_level; |
1150 | 0 | dev->last.wmode = span->wmode; |
1151 | 0 | if (font != dev->last.font) |
1152 | 0 | { |
1153 | 0 | fz_drop_font(ctx, dev->last.font); |
1154 | 0 | dev->last.font = fz_keep_font(ctx, font); |
1155 | 0 | } |
1156 | 0 | dev->last.valid = 1; |
1157 | 0 | dev->last.flags = flags; |
1158 | | |
1159 | | /* Calculate bounding box and new pen position based on font metrics */ |
1160 | 0 | if (span->items[i].gid >= 0) |
1161 | 0 | adv = span->items[i].adv; |
1162 | 0 | else |
1163 | 0 | adv = 0; |
1164 | |
|
1165 | 0 | unicode = span->items[i].ucs; |
1166 | 0 | if (unicode == FZ_REPLACEMENT_CHARACTER) |
1167 | 0 | { |
1168 | 0 | if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE) |
1169 | 0 | { |
1170 | 0 | unicode = span->items[i].cid; |
1171 | 0 | flags |= FZ_STEXT_UNICODE_IS_CID; |
1172 | 0 | } |
1173 | 0 | else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE) |
1174 | 0 | { |
1175 | 0 | unicode = span->items[i].gid; |
1176 | 0 | flags |= FZ_STEXT_UNICODE_IS_GID; |
1177 | 0 | } |
1178 | 0 | } |
1179 | | |
1180 | | /* Send the chars we have through. */ |
1181 | 0 | fz_add_stext_char(ctx, dev, font, |
1182 | 0 | unicode, |
1183 | 0 | span->items[i].gid, |
1184 | 0 | dev->last.trm, |
1185 | 0 | adv, |
1186 | 0 | dev->last.wmode, |
1187 | 0 | dev->last.bidi_level, |
1188 | 0 | (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), |
1189 | 0 | flags); |
1190 | 0 | } |
1191 | 0 | } |
1192 | | |
/*
 * Return the rune at position idx (counting from 0) of a UTF-8 string,
 * or -1 if the string terminates at or before that position.
 */
static int
rune_index(const char *utf8, size_t idx)
{
	int rune;

	for (;;)
	{
		utf8 += fz_chartorune(&rune, utf8);
		if (rune == 0)
			return -1;
		if (idx == 0)
			return rune;
		idx--;
	}
}
1209 | | |
1210 | | static void |
1211 | | flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i, int end) |
1212 | 0 | { |
1213 | 0 | int glyph = -2; |
1214 | |
|
1215 | 0 | if (*actualtext == 0) |
1216 | 0 | return; |
1217 | | |
1218 | 0 | if (!dev->last.valid) |
1219 | 0 | return; |
1220 | | |
1221 | 0 | if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) |
1222 | 0 | if (dev->last.clipped) |
1223 | 0 | return; |
1224 | | |
1225 | 0 | while (end < 0 || (end >= 0 && i < end)) |
1226 | 0 | { |
1227 | 0 | int rune; |
1228 | 0 | actualtext += fz_chartorune(&rune, actualtext); |
1229 | |
|
1230 | 0 | if (rune == 0) |
1231 | 0 | break; |
1232 | | |
1233 | 0 | fz_add_stext_char(ctx, dev, dev->last.font, |
1234 | 0 | rune, |
1235 | 0 | glyph, |
1236 | 0 | dev->last.trm, |
1237 | 0 | 0, |
1238 | 0 | dev->last.wmode, |
1239 | 0 | dev->last.bidi_level, |
1240 | 0 | (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), |
1241 | 0 | dev->last.flags); |
1242 | 0 | i++; |
1243 | |
|
1244 | 0 | glyph = -1; /* -1 for all but first glyph in the actualtext run */ |
1245 | 0 | } |
1246 | 0 | } |
1247 | | |
1248 | | static void |
1249 | | do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags) |
1250 | 0 | { |
1251 | | /* We are within an actualtext block. This means we can't just add the chars |
1252 | | * as they are. We need to add the chars as they are meant to be. Sadly the |
1253 | | * actualtext mechanism doesn't help us at all with positioning. */ |
1254 | 0 | fz_font *font = span->font; |
1255 | 0 | fz_matrix tm = span->trm; |
1256 | 0 | float adv; |
1257 | 0 | int start, i, end; |
1258 | 0 | char *actualtext = mt->text; |
1259 | 0 | size_t z = fz_utflen(actualtext); |
1260 | | |
1261 | | /* If actualtext is empty, nothing to do! */ |
1262 | 0 | if (z == 0) |
1263 | 0 | return; |
1264 | | |
1265 | | /* Now, we HOPE that the creator of a PDF will minimise the actual text |
1266 | | * differences, so that we'll get: |
1267 | | * "Politicians <ActualText="lie">fib</ActualText>, always." |
1268 | | * rather than: |
1269 | | * "<ActualText="Politicians lie, always">Politicians fib, always.</ActualText> |
1270 | | * but experience with PDF files tells us that this won't always be the case. |
1271 | | * |
1272 | | * We try to minimise the actualtext section here, just in case. |
1273 | | */ |
1274 | | |
1275 | | /* Spot a matching prefix and send it. */ |
1276 | 0 | for (start = 0; start < span->len; start++) |
1277 | 0 | { |
1278 | 0 | int rune; |
1279 | 0 | int len = fz_chartorune(&rune, actualtext); |
1280 | 0 | if (span->items[start].ucs != rune || rune == 0) |
1281 | 0 | break; |
1282 | 0 | actualtext += len; z--; |
1283 | 0 | } |
1284 | 0 | if (start != 0) |
1285 | 0 | do_extract(ctx, dev, span, ctm, 0, start, flags); |
1286 | |
|
1287 | 0 | if (start == span->len) |
1288 | 0 | { |
1289 | | /* The prefix has consumed all this object. Just shorten the actualtext and we'll |
1290 | | * catch the rest next time. */ |
1291 | 0 | z = strlen(actualtext)+1; |
1292 | 0 | memmove(mt->text, actualtext, z); |
1293 | 0 | return; |
1294 | 0 | } |
1295 | | |
1296 | | /* We haven't consumed the whole string, so there must be runes left. |
1297 | | * Shut coverity up. */ |
1298 | 0 | assert(z != 0); |
1299 | | |
1300 | | /* Spot a matching postfix. Can't send it til the end. */ |
1301 | 0 | for (end = span->len; end > start; end--) |
1302 | 0 | { |
1303 | | /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */ |
1304 | 0 | int rune = rune_index(actualtext, z-1); |
1305 | 0 | if (span->items[end-1].ucs != rune) |
1306 | 0 | break; |
1307 | 0 | z--; |
1308 | 0 | } |
1309 | | /* So we can send end -> span->len at the end. */ |
1310 | | |
1311 | | /* So we have at least SOME chars that don't match. */ |
1312 | | /* Now, do the difficult bit in the middle.*/ |
1313 | | /* items[start..end] have to be sent with actualtext[start..z] */ |
1314 | 0 | for (i = start; i < end; i++) |
1315 | 0 | { |
1316 | 0 | fz_text_item *item = &span->items[i]; |
1317 | 0 | int rune = -1; |
1318 | |
|
1319 | 0 | if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) |
1320 | 0 | { |
1321 | 0 | fz_rect r = current_clip(ctx, dev); |
1322 | 0 | if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r)) |
1323 | 0 | { |
1324 | 0 | dev->last.clipped = 1; |
1325 | 0 | continue; |
1326 | 0 | } |
1327 | 0 | } |
1328 | 0 | dev->last.clipped = 0; |
1329 | |
|
1330 | 0 | if ((size_t)i < z) |
1331 | 0 | actualtext += fz_chartorune(&rune, actualtext); |
1332 | | |
1333 | | /* Calculate new pen location and delta */ |
1334 | 0 | tm.e = item->x; |
1335 | 0 | tm.f = item->y; |
1336 | 0 | dev->last.trm = fz_concat(tm, ctm); |
1337 | 0 | dev->last.bidi_level = span->bidi_level; |
1338 | 0 | dev->last.wmode = span->wmode; |
1339 | 0 | if (font != dev->last.font) |
1340 | 0 | { |
1341 | 0 | fz_drop_font(ctx, dev->last.font); |
1342 | 0 | dev->last.font = fz_keep_font(ctx, font); |
1343 | 0 | } |
1344 | 0 | dev->last.valid = 1; |
1345 | 0 | dev->last.flags = flags; |
1346 | | |
1347 | | /* Calculate bounding box and new pen position based on font metrics */ |
1348 | 0 | if (item->gid >= 0) |
1349 | 0 | adv = item->adv; |
1350 | 0 | else |
1351 | 0 | adv = 0; |
1352 | |
|
1353 | 0 | fz_add_stext_char(ctx, dev, font, |
1354 | 0 | rune, |
1355 | 0 | span->items[i].gid, |
1356 | 0 | dev->last.trm, |
1357 | 0 | adv, |
1358 | 0 | dev->last.wmode, |
1359 | 0 | dev->last.bidi_level, |
1360 | 0 | (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), |
1361 | 0 | flags); |
1362 | 0 | } |
1363 | | |
1364 | | /* If we haven't spotted a postfix by this point, then don't force ourselves to output |
1365 | | * any more of the actualtext at this point. We might get a new text object that matches |
1366 | | * more of it. */ |
1367 | 0 | if (end == span->len) |
1368 | 0 | { |
1369 | | /* Shorten actualtext and exit. */ |
1370 | 0 | z = strlen(actualtext)+1; |
1371 | 0 | memmove(mt->text, actualtext, z); |
1372 | 0 | return; |
1373 | 0 | } |
1374 | | |
1375 | | /* if this is the first text on the page, and the actual text suffix matches the entire |
1376 | | * span text, then no font will have been set above, so set the last used font to the |
1377 | | * span font since flush_actualtext() assumes that a font has been set. |
1378 | | */ |
1379 | 0 | if (!dev->last.font) |
1380 | 0 | dev->last.font = fz_keep_font(ctx, font); |
1381 | | |
1382 | | /* We found a matching postfix. It seems likely that this is going to be the only |
1383 | | * text object we get, so send any remaining actualtext now. */ |
1384 | 0 | flush_actualtext(ctx, dev, actualtext, i, i + (int)strlen(actualtext) - (span->len - end)); |
1385 | | |
1386 | | /* Send the postfix */ |
1387 | 0 | if (end != span->len) |
1388 | 0 | do_extract(ctx, dev, span, ctm, end, span->len, flags); |
1389 | |
|
1390 | 0 | mt->text[0] = 0; |
1391 | 0 | } |
1392 | | |
1393 | | static void |
1394 | | fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags) |
1395 | 0 | { |
1396 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1397 | 0 | metatext_t *mt = NULL; |
1398 | |
|
1399 | 0 | if (span->len == 0) |
1400 | 0 | return; |
1401 | | |
1402 | | /* Are we in an actualtext? */ |
1403 | 0 | if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT)) |
1404 | 0 | mt = find_actualtext(dev); |
1405 | |
|
1406 | 0 | if (mt) |
1407 | 0 | do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags); |
1408 | 0 | else |
1409 | 0 | do_extract(ctx, dev, span, ctm, 0, span->len, flags); |
1410 | 0 | } |
1411 | | |
1412 | | static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha) |
1413 | 0 | { |
1414 | 0 | float rgb[3]; |
1415 | 0 | fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params); |
1416 | 0 | return |
1417 | 0 | (((uint32_t) fz_clampi(alpha * 255 + 0.5f, 0, 255)) << 24) | |
1418 | 0 | (((uint32_t) fz_clampi(rgb[0] * 255 + 0.5f, 0, 255)) << 16) | |
1419 | 0 | (((uint32_t) fz_clampi(rgb[1] * 255 + 0.5f, 0, 255)) << 8) | |
1420 | 0 | (((uint32_t) fz_clampi(rgb[2] * 255 + 0.5f, 0, 255))); |
1421 | 0 | } |
1422 | | |
1423 | | static void |
1424 | | fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, |
1425 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
1426 | 0 | { |
1427 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1428 | 0 | fz_text_span *span; |
1429 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1430 | 0 | return; |
1431 | 0 | tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha); |
1432 | 0 | for (span = text->head; span; span = span->next) |
1433 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED); |
1434 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1435 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1436 | 0 | } |
1437 | | |
1438 | | static void |
1439 | | fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, |
1440 | | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
1441 | 0 | { |
1442 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1443 | 0 | fz_text_span *span; |
1444 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1445 | 0 | return; |
1446 | 0 | tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha); |
1447 | 0 | for (span = text->head; span; span = span->next) |
1448 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED); |
1449 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1450 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1451 | 0 | } |
1452 | | |
1453 | | static void |
1454 | | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) |
1455 | 0 | { |
1456 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1457 | 0 | fz_text_span *span; |
1458 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1459 | 0 | return; |
1460 | 0 | tdev->color = 0; |
1461 | 0 | for (span = text->head; span; span = span->next) |
1462 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED); |
1463 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1464 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1465 | 0 | } |
1466 | | |
1467 | | static void |
1468 | | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) |
1469 | 0 | { |
1470 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1471 | 0 | fz_text_span *span; |
1472 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1473 | 0 | return; |
1474 | 0 | tdev->color = 0; |
1475 | 0 | for (span = text->head; span; span = span->next) |
1476 | 0 | fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED); |
1477 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1478 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1479 | 0 | } |
1480 | | |
1481 | | static void |
1482 | | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) |
1483 | 0 | { |
1484 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1485 | 0 | fz_text_span *span; |
1486 | 0 | if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) |
1487 | 0 | return; |
1488 | 0 | tdev->color = 0; |
1489 | 0 | for (span = text->head; span; span = span->next) |
1490 | 0 | fz_stext_extract(ctx, tdev, span, ctm, 0); |
1491 | 0 | fz_drop_text(ctx, tdev->lasttext); |
1492 | 0 | tdev->lasttext = fz_keep_text(ctx, text); |
1493 | 0 | } |
1494 | | |
1495 | | static void |
1496 | | fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text) |
1497 | 0 | { |
1498 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1499 | 0 | metatext_t *mt = find_actualtext(tdev); |
1500 | 0 | char *new_text = NULL; |
1501 | |
|
1502 | 0 | if (mt != NULL && meta == FZ_METATEXT_ACTUALTEXT) |
1503 | 0 | flush_actualtext(ctx, tdev, mt->text, 0, -1); |
1504 | |
|
1505 | 0 | if (meta == FZ_METATEXT_ACTUALTEXT) |
1506 | 0 | tdev->last.valid = 0; |
1507 | |
|
1508 | 0 | new_text = text ? fz_strdup(ctx, text) : NULL; |
1509 | |
|
1510 | 0 | fz_try(ctx) |
1511 | 0 | { |
1512 | 0 | mt = fz_malloc_struct(ctx, metatext_t); |
1513 | |
|
1514 | 0 | mt->prev = tdev->metatext; |
1515 | 0 | tdev->metatext = mt; |
1516 | 0 | mt->type = meta; |
1517 | 0 | mt->text = new_text; |
1518 | 0 | mt->bounds = fz_empty_rect; |
1519 | 0 | } |
1520 | 0 | fz_catch(ctx) |
1521 | 0 | { |
1522 | 0 | fz_free(ctx, new_text); |
1523 | 0 | fz_rethrow(ctx); |
1524 | 0 | } |
1525 | 0 | } |
1526 | | |
1527 | | static void |
1528 | | pop_metatext(fz_context *ctx, fz_stext_device *dev) |
1529 | 0 | { |
1530 | 0 | metatext_t *prev; |
1531 | 0 | fz_rect bounds; |
1532 | |
|
1533 | 0 | if (!dev->metatext) |
1534 | 0 | return; |
1535 | | |
1536 | 0 | prev = dev->metatext->prev; |
1537 | 0 | bounds = dev->metatext->bounds; |
1538 | 0 | fz_free(ctx, dev->metatext->text); |
1539 | 0 | fz_free(ctx, dev->metatext); |
1540 | 0 | dev->metatext = prev; |
1541 | 0 | if (prev) |
1542 | 0 | prev->bounds = fz_union_rect(prev->bounds, bounds); |
1543 | 0 | } |
1544 | | |
1545 | | static void |
1546 | | fz_stext_end_metatext(fz_context *ctx, fz_device *dev) |
1547 | 0 | { |
1548 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1549 | 0 | fz_font *myfont = NULL; |
1550 | |
|
1551 | 0 | if (!tdev->metatext) |
1552 | 0 | return; /* Mismatched pop. Live with it. */ |
1553 | | |
1554 | 0 | if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT || (tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0) |
1555 | 0 | { |
1556 | | /* We only deal with ActualText here. Just pop anything else off, |
1557 | | * and we're done. */ |
1558 | 0 | pop_metatext(ctx, tdev); |
1559 | 0 | return; |
1560 | 0 | } |
1561 | | |
1562 | | /* If we have a 'last' text position, send the content after that. */ |
1563 | 0 | if (tdev->last.valid) |
1564 | 0 | { |
1565 | 0 | tdev->last.trm.e = tdev->pen.x; |
1566 | 0 | tdev->last.trm.f = tdev->pen.y; |
1567 | |
|
1568 | 0 | flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1); |
1569 | 0 | pop_metatext(ctx, tdev); |
1570 | 0 | tdev->last.valid = 0; |
1571 | 0 | return; |
1572 | 0 | } |
1573 | | |
1574 | | /* Unless we have collected a rectangle for content that encloses the actual text, |
1575 | | * we can't do anything. */ |
1576 | 0 | if (fz_is_empty_rect(tdev->metatext->bounds)) |
1577 | 0 | { |
1578 | 0 | if ((dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) == 0 && tdev->metatext->text[0]) |
1579 | 0 | fz_warn(ctx, "ActualText with no position. Text may be lost or mispositioned."); |
1580 | 0 | pop_metatext(ctx, tdev); |
1581 | 0 | return; |
1582 | 0 | } |
1583 | | |
1584 | | /* We have a rectangle, so send the text to fill that. */ |
1585 | 0 | tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0; |
1586 | 0 | tdev->last.trm.b = 0; |
1587 | 0 | tdev->last.trm.c = 0; |
1588 | 0 | tdev->last.trm.d = tdev->metatext->bounds.y0 - tdev->metatext->bounds.y1; |
1589 | 0 | tdev->last.trm.e = tdev->metatext->bounds.x0; |
1590 | 0 | tdev->last.trm.f = tdev->metatext->bounds.y1; |
1591 | 0 | tdev->last.valid = 1; |
1592 | |
|
1593 | 0 | fz_var(myfont); |
1594 | |
|
1595 | 0 | fz_try(ctx) |
1596 | 0 | { |
1597 | 0 | if (tdev->last.font == NULL) |
1598 | 0 | { |
1599 | 0 | myfont = fz_new_base14_font(ctx, "Helvetica"); |
1600 | 0 | tdev->last.font = myfont; |
1601 | 0 | } |
1602 | 0 | flush_actualtext(ctx, tdev, tdev->metatext->text, 0, -1); |
1603 | 0 | pop_metatext(ctx, tdev); |
1604 | 0 | } |
1605 | 0 | fz_always(ctx) |
1606 | 0 | { |
1607 | 0 | if (myfont) |
1608 | 0 | { |
1609 | 0 | tdev->last.font = NULL; |
1610 | 0 | fz_drop_font(ctx, myfont); |
1611 | 0 | } |
1612 | 0 | } |
1613 | 0 | fz_catch(ctx) |
1614 | 0 | fz_rethrow(ctx); |
1615 | 0 | } |
1616 | | |
1617 | | |
1618 | | /* Images and shadings */ |
1619 | | |
1620 | | static void |
1621 | | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) |
1622 | 0 | { |
1623 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1624 | 0 | fz_rect *bounds = actualtext_bounds(tdev); |
1625 | | |
1626 | | /* If there is an actualtext in force, update its bounds. */ |
1627 | 0 | if (bounds) |
1628 | 0 | { |
1629 | 0 | static const fz_rect unit = { 0, 0, 1, 1 }; |
1630 | 0 | *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm)); |
1631 | 0 | } |
1632 | | |
1633 | | /* Unless we are being told to preserve images, nothing to do here. */ |
1634 | 0 | if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0) |
1635 | 0 | return; |
1636 | | |
1637 | | /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ |
1638 | 0 | if (alpha >= 0.5f) |
1639 | 0 | { |
1640 | 0 | fz_stext_block *block; |
1641 | 0 | flush_lazy_vectors(ctx, tdev->page, tdev); |
1642 | 0 | block = add_image_block_to_page(ctx, tdev->page, ctm, img, tdev->id); |
1643 | 0 | if (tdev->opts.flags & FZ_STEXT_CLIP) |
1644 | 0 | { |
1645 | 0 | fz_rect clip = fz_device_current_scissor(ctx, dev); |
1646 | 0 | clip = fz_intersect_rect(clip, tdev->page->mediabox); |
1647 | 0 | block->bbox = fz_intersect_rect(block->bbox, clip); |
1648 | 0 | } |
1649 | 0 | } |
1650 | 0 | } |
1651 | | |
1652 | | static void |
1653 | | fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, |
1654 | | fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params) |
1655 | 0 | { |
1656 | 0 | fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); |
1657 | 0 | } |
1658 | | |
1659 | | static fz_image * |
1660 | | fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor) |
1661 | 0 | { |
1662 | 0 | fz_matrix ctm = *in_out_ctm; |
1663 | 0 | fz_pixmap *pix; |
1664 | 0 | fz_image *img = NULL; |
1665 | 0 | fz_rect bounds; |
1666 | 0 | fz_irect bbox; |
1667 | |
|
1668 | 0 | bounds = fz_bound_shade(ctx, shade, ctm); |
1669 | 0 | bounds = fz_intersect_rect(bounds, scissor); |
1670 | 0 | bbox = fz_irect_from_rect(bounds); |
1671 | |
|
1672 | 0 | pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); |
1673 | 0 | fz_try(ctx) |
1674 | 0 | { |
1675 | 0 | if (shade->use_background) |
1676 | 0 | fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); |
1677 | 0 | else |
1678 | 0 | fz_clear_pixmap(ctx, pix); |
1679 | 0 | fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL); |
1680 | 0 | img = fz_new_image_from_pixmap(ctx, pix, NULL); |
1681 | 0 | } |
1682 | 0 | fz_always(ctx) |
1683 | 0 | fz_drop_pixmap(ctx, pix); |
1684 | 0 | fz_catch(ctx) |
1685 | 0 | fz_rethrow(ctx); |
1686 | | |
1687 | 0 | in_out_ctm->a = pix->w; |
1688 | 0 | in_out_ctm->b = 0; |
1689 | 0 | in_out_ctm->c = 0; |
1690 | 0 | in_out_ctm->d = pix->h; |
1691 | 0 | in_out_ctm->e = pix->x; |
1692 | 0 | in_out_ctm->f = pix->y; |
1693 | 0 | return img; |
1694 | 0 | } |
1695 | | |
1696 | | static void |
1697 | | fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params) |
1698 | 0 | { |
1699 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
1700 | 0 | fz_matrix local_ctm; |
1701 | 0 | fz_rect scissor; |
1702 | 0 | fz_image *image; |
1703 | | |
1704 | | /* If we aren't preserving images, don't waste time making the shade. */ |
1705 | 0 | if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0) |
1706 | 0 | { |
1707 | | /* But we do still need to handle actualtext bounds. */ |
1708 | 0 | fz_rect *bounds = actualtext_bounds(tdev); |
1709 | 0 | if (bounds) |
1710 | 0 | *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm)); |
1711 | 0 | return; |
1712 | 0 | } |
1713 | | |
1714 | 0 | local_ctm = ctm; |
1715 | 0 | scissor = fz_device_current_scissor(ctx, dev); |
1716 | 0 | if (dev->flags & FZ_STEXT_CLIP_RECT) |
1717 | 0 | scissor = fz_intersect_rect(scissor, tdev->opts.clip); |
1718 | 0 | scissor = fz_intersect_rect(scissor, tdev->page->mediabox); |
1719 | 0 | image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); |
1720 | 0 | fz_try(ctx) |
1721 | 0 | fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); |
1722 | 0 | fz_always(ctx) |
1723 | 0 | fz_drop_image(ctx, image); |
1724 | 0 | fz_catch(ctx) |
1725 | 0 | fz_rethrow(ctx); |
1726 | 0 | } |
1727 | | |
1728 | | static void |
1729 | | fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block) |
1730 | 0 | { |
1731 | 0 | fz_stext_line *line; |
1732 | 0 | fz_stext_char *ch; |
1733 | |
|
1734 | 0 | for ( ; block != NULL; block = block->next) |
1735 | 0 | { |
1736 | 0 | if (block->type == FZ_STEXT_BLOCK_STRUCT) |
1737 | 0 | { |
1738 | 0 | if (block->u.s.down) |
1739 | 0 | { |
1740 | 0 | fz_stext_block *block2; |
1741 | 0 | fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block); |
1742 | 0 | for (block2 = block->u.s.down->first_block; block2 != NULL; block2 = block2->next) |
1743 | 0 | { |
1744 | 0 | block->bbox = fz_union_rect(block->bbox, block2->bbox); |
1745 | 0 | } |
1746 | 0 | } |
1747 | 0 | } |
1748 | 0 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
1749 | 0 | continue; |
1750 | 0 | for (line = block->u.t.first_line; line; line = line->next) |
1751 | 0 | { |
1752 | 0 | int reorder = 0; |
1753 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
1754 | 0 | { |
1755 | 0 | fz_rect ch_box = fz_rect_from_quad(ch->quad); |
1756 | 0 | if (ch == line->first_char) |
1757 | 0 | line->bbox = ch_box; |
1758 | 0 | else |
1759 | 0 | line->bbox = fz_union_rect(line->bbox, ch_box); |
1760 | 0 | if (ch->bidi == 3) |
1761 | 0 | reorder = 1; |
1762 | 0 | } |
1763 | 0 | block->bbox = fz_union_rect(block->bbox, line->bbox); |
1764 | 0 | if (reorder) |
1765 | 0 | reverse_bidi_line(line); |
1766 | 0 | } |
1767 | 0 | } |
1768 | 0 | } |
1769 | | |
1770 | | static void |
1771 | | advance_to_x(fz_point *a, fz_point b, float x) |
1772 | 0 | { |
1773 | 0 | a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x); |
1774 | 0 | a->x = x; |
1775 | 0 | } |
1776 | | |
1777 | | static void |
1778 | | advance_to_y(fz_point *a, fz_point b, float y) |
1779 | 0 | { |
1780 | 0 | a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y); |
1781 | 0 | a->y = y; |
1782 | 0 | } |
1783 | | |
1784 | | static int |
1785 | | line_crosses_rect(fz_point a, fz_point b, fz_rect r) |
1786 | 0 | { |
1787 | | /* Cope with trivial exclusions */ |
1788 | 0 | if (a.x < r.x0 && b.x < r.x0) |
1789 | 0 | return 0; |
1790 | 0 | if (a.x > r.x1 && b.x > r.x1) |
1791 | 0 | return 0; |
1792 | 0 | if (a.y < r.y0 && b.y < r.y0) |
1793 | 0 | return 0; |
1794 | 0 | if (a.y > r.y1 && b.y > r.y1) |
1795 | 0 | return 0; |
1796 | | |
1797 | 0 | if (a.x < r.x0) |
1798 | 0 | advance_to_x(&a, b, r.x0); |
1799 | 0 | if (a.x > r.x1) |
1800 | 0 | advance_to_x(&a, b, r.x1); |
1801 | 0 | if (a.y < r.y0) |
1802 | 0 | advance_to_y(&a, b, r.y0); |
1803 | 0 | if (a.y > r.y1) |
1804 | 0 | advance_to_y(&a, b, r.y1); |
1805 | |
|
1806 | 0 | return fz_is_point_inside_rect(a, r); |
1807 | 0 | } |
1808 | | |
1809 | | static float |
1810 | | calculate_ascent(fz_point p, fz_point origin, fz_point dir) |
1811 | 0 | { |
1812 | 0 | return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x); |
1813 | 0 | } |
1814 | | |
1815 | | /* Create us a rect from the given quad, but extend it downwards |
1816 | | * to allow for underlines that pass under the glyphs. */ |
1817 | | static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size) |
1818 | 0 | { |
1819 | | /* Consider the two rects from A and g respectively. |
1820 | | * |
1821 | | * ul +------+ ur or |
1822 | | * | /\ | ul +------+ ur |
1823 | | * | /__\ | | /''\ | |
1824 | | * |/ \| |( || |
1825 | | * ll +------+ lr | ''''|| |
1826 | | * | ''' | <-expected underline level |
1827 | | * ll +------+ lr |
1828 | | * |
1829 | | * So an underline won't cross A's rect, but will cross g's. |
1830 | | * We want to make a rect that includes a suitable amount of |
1831 | | * space underneath. The information we have available to us |
1832 | | * is summed up here: |
1833 | | * |
1834 | | * ul +---------+ ur |
1835 | | * | | |
1836 | | * | origin | |
1837 | | * |+----------> dir |
1838 | | * | | |
1839 | | * ll +---------+ lr |
1840 | | * |
1841 | | * Consider the distance from ul to the line that passes through |
1842 | | * the origin with direction dir. Similarly, consider the distance |
1843 | | * from ur to the same line. This can be thought of as the 'ascent' |
1844 | | * of this character. |
1845 | | * |
1846 | | * We'd like the distance from ul to ll to be greater than this, so |
1847 | | * as to ensure we cover the possible location where an underline |
1848 | | * might reasonably go. |
1849 | | * |
1850 | | * If we have a line (l) through point A with direction vector u, |
1851 | | * the distance between point P and line(l) is: |
1852 | | * |
1853 | | * d(P,l) = || AP x u || / || u || |
1854 | | * |
1855 | | * where x is the cross product. |
1856 | | * |
1857 | | * For us, because || dir || = 1: |
1858 | | * |
1859 | | * d(ul, origin) = || (origin-ul) x dir || |
1860 | | * |
1861 | | * The cross product is only defined in 3 (or 7!) dimensions, so |
1862 | | * extend both vectors into 3d by defining a 0 z component. |
1863 | | * |
1864 | | * (origin-ul) x dir = [ (origin.y - ul.y) . 0 - 0 . dir.y ] |
1865 | | * [ 0 . dir.x - (origin.x - ul.y) . 0 ] |
1866 | | * [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ] |
1867 | | * |
1868 | | * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x |
1869 | | */ |
1870 | 0 | float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2; |
1871 | 0 | fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y }; |
1872 | 0 | fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y }; |
1873 | 0 | float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2; |
1874 | 0 | int neg = 0; |
1875 | 0 | float extra_rise = 0; |
1876 | | |
1877 | | /* Spaces will have 0 ascent. underscores will have small ascent. |
1878 | | * We want a sane ascent to be able to spot strikeouts, but not |
1879 | | * so big that it incorporates lines above the text, like borders. */ |
1880 | 0 | if (ascent < 0.75*size) |
1881 | 0 | extra_rise = 0.75*size - ascent; |
1882 | | |
1883 | | /* We'd like height to be at least ascent + 1/4 size */ |
1884 | 0 | if (height < 0) |
1885 | 0 | neg = 1, height = -height; |
1886 | 0 | if (height < ascent + size * 0.25f) |
1887 | 0 | height = ascent + size * 0.25f; |
1888 | |
|
1889 | 0 | height -= ascent; |
1890 | 0 | if (neg) |
1891 | 0 | height = -height; |
1892 | 0 | quad.ll.x += - height * dir.y; |
1893 | 0 | quad.ll.y += height * dir.x; |
1894 | 0 | quad.lr.x += - height * dir.y; |
1895 | 0 | quad.lr.y += height * dir.x; |
1896 | 0 | quad.ul.x -= - extra_rise * dir.y; |
1897 | 0 | quad.ul.y -= extra_rise * dir.x; |
1898 | 0 | quad.ur.x -= - extra_rise * dir.y; |
1899 | 0 | quad.ur.y -= extra_rise * dir.x; |
1900 | |
|
1901 | 0 | return fz_rect_from_quad(quad); |
1902 | 0 | } |
1903 | | |
/* Approximate float equality: returns non-zero when a and b differ by
 * less than EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
	float delta = a - b;
	float magnitude = (delta < 0) ? -delta : delta;

	return magnitude < EPSILON;
}
1912 | | |
1913 | | static void |
1914 | | check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness) |
1915 | 0 | { |
1916 | 0 | for ( ; block; block = block->next) |
1917 | 0 | { |
1918 | 0 | fz_stext_line *line; |
1919 | |
|
1920 | 0 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
1921 | 0 | continue; |
1922 | | |
1923 | 0 | for (line = block->u.t.first_line; line != NULL; line = line->next) |
1924 | 0 | { |
1925 | 0 | fz_stext_char *ch; |
1926 | |
|
1927 | 0 | if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) && |
1928 | 0 | (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y))) |
1929 | 0 | continue; |
1930 | | |
1931 | | /* Matching directions... */ |
1932 | | |
1933 | | /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check |
1934 | | * chars. - FIXME: Now we do! */ |
1935 | 0 | for (ch = line->first_char; ch; ch = ch->next) |
1936 | 0 | { |
1937 | 0 | fz_point up; |
1938 | 0 | float dx, dy, dot; |
1939 | 0 | fz_rect ch_box; |
1940 | |
|
1941 | 0 | ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size); |
1942 | |
|
1943 | 0 | if (!line_crosses_rect(from, to, ch_box)) |
1944 | 0 | continue; |
1945 | | |
1946 | | /* If the thickness is more than a 1/4 of the size, it's a highlight, not a |
1947 | | * line! */ |
1948 | 0 | if (ch->size < thickness*4) |
1949 | 0 | { |
1950 | | /* Distinguish from a background fill */ |
1951 | 0 | if (thickness <= ch->size*1.5f) |
1952 | 0 | ch->flags |= FZ_STEXT_HIGHLIGHT; |
1953 | 0 | continue; |
1954 | 0 | } |
1955 | | |
1956 | | /* Is this a strikeout or an underline? */ |
1957 | | |
1958 | | /* The baseline moves from ch->origin in the direction line->dir */ |
1959 | 0 | up.x = line->dir.y; |
1960 | 0 | up.y = -line->dir.x; |
1961 | | |
1962 | | /* How far is our line displaced from the line through the origin? */ |
1963 | 0 | dx = from.x - ch->origin.x; |
1964 | 0 | dy = from.y - ch->origin.y; |
1965 | | /* Dot product with up. up is normalised */ |
1966 | 0 | dot = dx * up.x + dy * up.y; |
1967 | |
|
1968 | 0 | if (dot > 0 && dot <= 0.8f * ch->font->ascender * ch->size) |
1969 | 0 | ch->flags |= FZ_STEXT_STRIKEOUT; |
1970 | 0 | else |
1971 | 0 | ch->flags |= FZ_STEXT_UNDERLINE; |
1972 | 0 | } |
1973 | 0 | } |
1974 | 0 | } |
1975 | 0 | } |
1976 | | |
1977 | | static void |
1978 | | check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page) |
1979 | 0 | { |
1980 | 0 | int i, n = tdev->rect_len; |
1981 | |
|
1982 | 0 | for (i = 0; i < n; i++) |
1983 | 0 | { |
1984 | 0 | fz_point from = tdev->rects[i].from; |
1985 | 0 | fz_point to = tdev->rects[i].to; |
1986 | 0 | float thickness = tdev->rects[i].thickness; |
1987 | 0 | fz_point dir; |
1988 | 0 | dir.x = to.x - from.x; |
1989 | 0 | dir.y = to.y - from.y; |
1990 | 0 | dir = fz_normalize_vector(dir); |
1991 | |
|
1992 | 0 | check_strikeout(ctx, page->first_block, from, to, dir, thickness); |
1993 | 0 | } |
1994 | 0 | } |
1995 | | |
1996 | | static void |
1997 | | fz_stext_close_device(fz_context *ctx, fz_device *dev) |
1998 | 0 | { |
1999 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2000 | 0 | fz_stext_page *page = tdev->page; |
2001 | |
|
2002 | 0 | if ((tdev->flags & FZ_STEXT_DEHYPHENATE) && fz_is_unicode_hyphen(tdev->lastchar) && tdev->lastline != NULL) |
2003 | 0 | tdev->lastline->flags |= FZ_STEXT_LINE_FLAGS_JOINED; |
2004 | |
|
2005 | 0 | flush_lazy_vectors(ctx, page, tdev); |
2006 | |
|
2007 | 0 | fixup_bboxes_and_bidi(ctx, page->first_block); |
2008 | |
|
2009 | 0 | if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) |
2010 | 0 | check_rects_for_strikeout(ctx, tdev, page); |
2011 | | |
2012 | | /* TODO: smart sorting of blocks and lines in reading order */ |
2013 | | /* TODO: unicode NFC normalization */ |
2014 | |
|
2015 | 0 | if (tdev->opts.flags & FZ_STEXT_SEGMENT) |
2016 | 0 | fz_segment_stext_page(ctx, page); |
2017 | |
|
2018 | 0 | if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK) |
2019 | 0 | fz_paragraph_break(ctx, page); |
2020 | |
|
2021 | 0 | if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT) |
2022 | 0 | fz_table_hunt(ctx, page); |
2023 | 0 | } |
2024 | | |
2025 | | static void |
2026 | | fz_stext_drop_device(fz_context *ctx, fz_device *dev) |
2027 | 0 | { |
2028 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2029 | 0 | fz_drop_text(ctx, tdev->lasttext); |
2030 | 0 | fz_drop_font(ctx, tdev->last.font); |
2031 | 0 | while (tdev->metatext) |
2032 | 0 | pop_metatext(ctx, tdev); |
2033 | |
|
2034 | 0 | fz_free(ctx, tdev->rects); |
2035 | 0 | } |
2036 | | |
2037 | | static int |
2038 | | val_is_rect(const char *val, fz_rect *rp) |
2039 | 0 | { |
2040 | 0 | fz_rect r; |
2041 | 0 | const char *s; |
2042 | |
|
2043 | 0 | s = strchr(val, ':'); |
2044 | 0 | if (s == NULL || s == val) |
2045 | 0 | return 0; |
2046 | 0 | r.x0 = fz_atof(val); |
2047 | 0 | val = s+1; |
2048 | 0 | s = strchr(val, ':'); |
2049 | 0 | if (s == NULL || s == val) |
2050 | 0 | return 0; |
2051 | 0 | r.y0 = fz_atof(val); |
2052 | 0 | val = s+1; |
2053 | 0 | s = strchr(val, ':'); |
2054 | 0 | if (s == NULL || s == val) |
2055 | 0 | return 0; |
2056 | 0 | r.x1 = fz_atof(val); |
2057 | 0 | val = s+1; |
2058 | 0 | r.y1 = fz_atof(val); |
2059 | |
|
2060 | 0 | *rp = r; |
2061 | |
|
2062 | 0 | return 1; |
2063 | 0 | } |
2064 | | |
2065 | | void fz_init_stext_options(fz_context *ctx, fz_stext_options *opts) |
2066 | 0 | { |
2067 | 0 | memset(opts, 0, sizeof *opts); |
2068 | |
|
2069 | 0 | opts->flags |= FZ_STEXT_CLIP; |
2070 | 0 | opts->scale = 1; |
2071 | 0 | } |
2072 | | |
2073 | | fz_stext_options * |
2074 | | fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) |
2075 | 0 | { |
2076 | 0 | fz_options *options = fz_new_options(ctx, string); |
2077 | 0 | fz_try(ctx) |
2078 | 0 | { |
2079 | 0 | fz_init_stext_options(ctx, opts); |
2080 | 0 | fz_apply_stext_options(ctx, opts, options); |
2081 | 0 | fz_throw_on_unused_options(ctx, options, "stext"); |
2082 | 0 | } |
2083 | 0 | fz_always(ctx) |
2084 | 0 | fz_drop_options(ctx, options); |
2085 | 0 | fz_catch(ctx) |
2086 | 0 | fz_rethrow(ctx); |
2087 | 0 | return opts; |
2088 | 0 | } |
2089 | | |
2090 | | #define SETCLEARBOOL(A, B, C) \ |
2091 | 0 | (A) = (B) ? ((A) | (C)) : ((A) & ~(C)) |
2092 | | |
2093 | | void |
2094 | | fz_apply_stext_options(fz_context *ctx, fz_stext_options *opts, fz_options *string) |
2095 | 0 | { |
2096 | 0 | const char *val; |
2097 | 0 | float x; |
2098 | 0 | int b; |
2099 | | |
2100 | | /* when adding options, remember to update fz_stext_options_usage above */ |
2101 | |
|
2102 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-ligatures", &b)) |
2103 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_LIGATURES); |
2104 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-whitespace", &b)) |
2105 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_WHITESPACE); |
2106 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-images", &b)) |
2107 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_IMAGES); |
2108 | 0 | if (fz_lookup_option_boolean(ctx, string, "inhibit-spaces", &b)) |
2109 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_INHIBIT_SPACES); |
2110 | 0 | if (fz_lookup_option_boolean(ctx, string, "dehyphenate", &b)) |
2111 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_DEHYPHENATE); |
2112 | 0 | if (fz_lookup_option_boolean(ctx, string, "preserve-spans", &b)) |
2113 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PRESERVE_SPANS); |
2114 | 0 | if (fz_lookup_option_boolean(ctx, string, "structured", &b)) |
2115 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STRUCTURE); |
2116 | 0 | if (fz_lookup_option_boolean(ctx, string, "use-cid-for-unknown-unicode", &b)) |
2117 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE); |
2118 | 0 | if (fz_lookup_option_boolean(ctx, string, "use-gid-for-unknown-unicode", &b)) |
2119 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE); |
2120 | 0 | if (fz_lookup_option_boolean(ctx, string, "accurate-bboxes", &b)) |
2121 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_BBOXES); |
2122 | 0 | if (fz_lookup_option_boolean(ctx, string, "vectors", &b)) |
2123 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_VECTORS); |
2124 | 0 | if (fz_lookup_option_boolean(ctx, string, "lazy-vectors", &b)) |
2125 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_LAZY_VECTORS); |
2126 | 0 | if (fz_lookup_option_boolean(ctx, string, "fuzzy-vectors", &b)) |
2127 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_FUZZY_VECTORS); |
2128 | 0 | if (fz_lookup_option_boolean(ctx, string, "ignore-actualtext", &b)) |
2129 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_IGNORE_ACTUALTEXT); |
2130 | 0 | if (fz_lookup_option_boolean(ctx, string, "segment", &b)) |
2131 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_SEGMENT); |
2132 | 0 | if (fz_lookup_option_boolean(ctx, string, "paragraph-break", &b)) |
2133 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_PARAGRAPH_BREAK); |
2134 | 0 | if (fz_lookup_option_boolean(ctx, string, "table-hunt", &b)) |
2135 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_TABLE_HUNT); |
2136 | 0 | if (fz_lookup_option_boolean(ctx, string, "collect-styles", &b)) |
2137 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_COLLECT_STYLES); |
2138 | 0 | if (fz_lookup_option_boolean(ctx, string, "accurate-ascenders", &b)) |
2139 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_ASCENDERS); |
2140 | 0 | if (fz_lookup_option_boolean(ctx, string, "accurate-side-bearings", &b)) |
2141 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_ACCURATE_SIDE_BEARINGS); |
2142 | |
|
2143 | 0 | if (fz_lookup_option_boolean(ctx, string, "mediabox-clip", &b)) |
2144 | 0 | { |
2145 | 0 | fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead."); |
2146 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP); |
2147 | 0 | } |
2148 | 0 | if (fz_lookup_option_boolean(ctx, string, "clip", &b)) |
2149 | 0 | SETCLEARBOOL(opts->flags, b, FZ_STEXT_CLIP); |
2150 | |
|
2151 | 0 | if (fz_lookup_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip)) |
2152 | 0 | opts->flags |= FZ_STEXT_CLIP_RECT; |
2153 | |
|
2154 | 0 | if (fz_lookup_option_float(ctx, string, "resolution", &x)) |
2155 | 0 | opts->scale = x / 96.0f; /* HTML base resolution is 96ppi */ |
2156 | |
|
2157 | 0 | fz_validate_options(ctx, string, "stext"); |
2158 | 0 | } |
2159 | | |
2160 | | typedef struct |
2161 | | { |
2162 | | int fail; |
2163 | | int count; |
2164 | | fz_point corners[4]; |
2165 | | } is_rect_data; |
2166 | | |
2167 | | static void |
2168 | | stash_point(is_rect_data *rd, float x, float y) |
2169 | 0 | { |
2170 | 0 | if (rd->count > 3) |
2171 | 0 | { |
2172 | 0 | rd->fail = 1; |
2173 | 0 | return; |
2174 | 0 | } |
2175 | | |
2176 | 0 | rd->corners[rd->count].x = x; |
2177 | 0 | rd->corners[rd->count].y = y; |
2178 | 0 | rd->count++; |
2179 | 0 | } |
2180 | | |
2181 | | static void |
2182 | | is_rect_moveto(fz_context *ctx, void *arg, float x, float y) |
2183 | 0 | { |
2184 | 0 | is_rect_data *rd = arg; |
2185 | 0 | if (rd->fail) |
2186 | 0 | return; |
2187 | | |
2188 | 0 | if (rd->count != 0) |
2189 | 0 | { |
2190 | 0 | rd->fail = 1; |
2191 | 0 | return; |
2192 | 0 | } |
2193 | 0 | stash_point(rd, x, y); |
2194 | 0 | } |
2195 | | |
2196 | | static void |
2197 | | is_rect_lineto(fz_context *ctx, void *arg, float x, float y) |
2198 | 0 | { |
2199 | 0 | is_rect_data *rd = arg; |
2200 | 0 | if (rd->fail) |
2201 | 0 | return; |
2202 | | |
2203 | 0 | if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y) |
2204 | 0 | return; |
2205 | | |
2206 | 0 | stash_point(rd, x, y); |
2207 | 0 | } |
2208 | | |
2209 | | static void |
2210 | | is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3) |
2211 | 0 | { |
2212 | 0 | is_rect_data *rd = arg; |
2213 | 0 | rd->fail = 1; |
2214 | 0 | } |
2215 | | |
2216 | | static void |
2217 | | is_rect_closepath(fz_context *ctx, void *arg) |
2218 | 0 | { |
2219 | 0 | is_rect_data *rd = arg; |
2220 | 0 | if (rd->fail) |
2221 | 0 | return; |
2222 | 0 | if (rd->count == 3) |
2223 | 0 | stash_point(rd, rd->corners[0].x, rd->corners[0].y); |
2224 | 0 | if (rd->count != 4) |
2225 | 0 | rd->fail = 1; |
2226 | 0 | } |
2227 | | |
2228 | | static int |
2229 | | is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm, fz_rect *r) |
2230 | 0 | { |
2231 | 0 | float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y; |
2232 | 0 | is_rect_data rd = { 0 }; |
2233 | 0 | static const fz_path_walker walker = |
2234 | 0 | { |
2235 | 0 | is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath |
2236 | 0 | }; |
2237 | 0 | int i; |
2238 | |
|
2239 | 0 | fz_walk_path(ctx, path, &walker, &rd); |
2240 | |
|
2241 | 0 | if (rd.fail) |
2242 | 0 | return 0; |
2243 | | |
2244 | 0 | if (rd.count == 2) |
2245 | 0 | { |
2246 | 0 | stash_point(&rd, rd.corners[1].x, rd.corners[1].y); |
2247 | 0 | stash_point(&rd, rd.corners[0].x, rd.corners[0].y); |
2248 | 0 | } |
2249 | |
|
2250 | 0 | for (i = 0 ; i < 4; i++) |
2251 | 0 | { |
2252 | 0 | fz_point p = fz_transform_point(rd.corners[i], ctm); |
2253 | |
|
2254 | 0 | rd.corners[i].x = p.x; |
2255 | 0 | rd.corners[i].y = p.y; |
2256 | 0 | } |
2257 | | |
2258 | | /* So we have a 4 cornered path. Hopefully something like: |
2259 | | * 0---------1 |
2260 | | * | | |
2261 | | * 3---------2 |
2262 | | * but it might be: |
2263 | | * 0---------3 |
2264 | | * | | |
2265 | | * 1---------2 |
2266 | | */ |
2267 | 0 | while (1) |
2268 | 0 | { |
2269 | 0 | d01x = rd.corners[1].x - rd.corners[0].x; |
2270 | 0 | d01y = rd.corners[1].y - rd.corners[0].y; |
2271 | 0 | d01 = d01x * d01x + d01y * d01y; |
2272 | 0 | d03x = rd.corners[3].x - rd.corners[0].x; |
2273 | 0 | d03y = rd.corners[3].y - rd.corners[0].y; |
2274 | 0 | d03 = d03x * d03x + d03y * d03y; |
2275 | 0 | if(d01 < d03) |
2276 | 0 | { |
2277 | | /* We are the latter case. Transpose it. */ |
2278 | 0 | fz_point p = rd.corners[1]; |
2279 | 0 | rd.corners[1] = rd.corners[3]; |
2280 | 0 | rd.corners[3] = p; |
2281 | 0 | } |
2282 | 0 | else |
2283 | 0 | break; |
2284 | 0 | } |
2285 | 0 | d32x = rd.corners[2].x - rd.corners[3].x; |
2286 | 0 | d32y = rd.corners[2].y - rd.corners[3].y; |
2287 | | |
2288 | | /* So d32x and d01x need to be the same for this to be a strikeout. */ |
2289 | 0 | if (!feq(d32x, d01x) || !feq(d32y, d01y)) |
2290 | 0 | return 0; |
2291 | | |
2292 | | /* We are plausibly a rectangle. */ |
2293 | 0 | *thickness = sqrtf(d03x * d03x + d03y * d03y); |
2294 | |
|
2295 | 0 | from->x = (rd.corners[0].x + rd.corners[3].x)/2; |
2296 | 0 | from->y = (rd.corners[0].y + rd.corners[3].y)/2; |
2297 | 0 | to->x = (rd.corners[1].x + rd.corners[2].x)/2; |
2298 | 0 | to->y = (rd.corners[1].y + rd.corners[2].y)/2; |
2299 | |
|
2300 | 0 | *r = fz_empty_rect; |
2301 | 0 | if ((rd.corners[0].x == rd.corners[3].x && rd.corners[1].x == rd.corners[2].x && |
2302 | 0 | rd.corners[0].y == rd.corners[1].y && rd.corners[2].y == rd.corners[3].y) || |
2303 | 0 | (rd.corners[0].x == rd.corners[1].x && rd.corners[3].x == rd.corners[2].x && |
2304 | 0 | rd.corners[0].y == rd.corners[3].y && rd.corners[2].y == rd.corners[1].y)) |
2305 | 0 | { |
2306 | 0 | *r = fz_include_point_in_rect(*r, rd.corners[0]); |
2307 | 0 | *r = fz_include_point_in_rect(*r, rd.corners[2]); |
2308 | 0 | } |
2309 | |
|
2310 | 0 | return 1; |
2311 | 0 | } |
2312 | | |
2313 | | static void |
2314 | | check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm, int argb) |
2315 | 0 | { |
2316 | 0 | float thickness; |
2317 | 0 | fz_point from, to; |
2318 | 0 | int i, n = tdev->rect_len; |
2319 | 0 | fz_rect r; |
2320 | | |
2321 | | /* Is this path a thin rectangle (possibly rotated)? If so, then we need to |
2322 | | * consider it as being a strikeout or underline. */ |
2323 | 0 | if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm, &r)) |
2324 | 0 | return; |
2325 | | |
2326 | | /* If we've already had a rectangle of the same colour that covers this region |
2327 | | * then that was probably a cell background color, and this is probably a |
2328 | | * text string background fill. This is not a highlight, or underline or |
2329 | | * strikeout, so don't keep it. */ |
2330 | 0 | for (i = 0; i < n; i++) |
2331 | 0 | { |
2332 | 0 | rect_details *rct = &tdev->rects[i]; |
2333 | 0 | if (rct->argb == argb && fz_contains_rect(rct->rect, r)) |
2334 | 0 | return; |
2335 | 0 | } |
2336 | | |
2337 | | /* Add to the list of rects in the device. */ |
2338 | 0 | if (tdev->rect_len == tdev->rect_max) |
2339 | 0 | { |
2340 | 0 | int newmax = tdev->rect_max * 2; |
2341 | 0 | if (newmax == 0) |
2342 | 0 | newmax = 32; |
2343 | |
|
2344 | 0 | tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax); |
2345 | 0 | tdev->rect_max = newmax; |
2346 | 0 | } |
2347 | 0 | tdev->rects[tdev->rect_len].from = from; |
2348 | 0 | tdev->rects[tdev->rect_len].to = to; |
2349 | 0 | tdev->rects[tdev->rect_len].thickness = thickness; |
2350 | 0 | tdev->rects[tdev->rect_len].rect = r; |
2351 | 0 | tdev->rects[tdev->rect_len].argb = argb; |
2352 | 0 | tdev->rect_len++; |
2353 | 0 | } |
2354 | | |
2355 | | static void |
2356 | | add_vector(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, fz_rect bbox, uint32_t flags, uint32_t argb, int id, float exp) |
2357 | 0 | { |
2358 | 0 | fz_stext_block *b; |
2359 | |
|
2360 | 0 | if (exp != 0) |
2361 | 0 | { |
2362 | 0 | bbox.x0 -= exp; |
2363 | 0 | bbox.y0 -= exp; |
2364 | 0 | bbox.x1 += exp; |
2365 | 0 | bbox.y1 += exp; |
2366 | 0 | } |
2367 | |
|
2368 | 0 | if (tdev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP)) |
2369 | 0 | { |
2370 | 0 | fz_rect r = current_clip(ctx, tdev); |
2371 | 0 | bbox = fz_intersect_rect(bbox, r); |
2372 | 0 | if (!fz_is_valid_rect(bbox)) |
2373 | 0 | return; |
2374 | 0 | } |
2375 | | |
2376 | | /* Can we just add this one onto the previous one? */ |
2377 | | /* Only if it's a small rectangle... */ |
2378 | 0 | if ((flags & FZ_STEXT_VECTOR_IS_RECTANGLE) && bbox.x1 - bbox.x0 <= 2 && bbox.y1 - bbox.y0 <= 2) |
2379 | 0 | { |
2380 | 0 | fz_stext_block *prev; |
2381 | | /* Find b = the previous block. */ |
2382 | 0 | if (tdev->flags & FZ_STEXT_LAZY_VECTORS) |
2383 | 0 | b = tdev->lazy_vectors_tail; |
2384 | 0 | else if (page->last_struct) |
2385 | 0 | b = page->last_struct->last_block; |
2386 | 0 | else |
2387 | 0 | b = page->last_block; |
2388 | |
|
2389 | 0 | if (b && b->type == FZ_STEXT_BLOCK_VECTOR && b->u.v.argb == argb && b->u.v.flags == flags) |
2390 | 0 | { |
2391 | | /* Maybe we can join it? */ |
2392 | 0 | float fudge = 0.001f; |
2393 | 0 | if (b->bbox.x0 == bbox.x0 && b->bbox.x1 == bbox.x1 && b->bbox.y1 + fudge >= bbox.y0 && b->bbox.y0 - fudge <= bbox.y1) |
2394 | 0 | { |
2395 | | /* Stacks vertically. */ |
2396 | 0 | b->bbox.y0 = fz_min(b->bbox.y0, bbox.y0); |
2397 | 0 | b->bbox.y1 = fz_max(b->bbox.y1, bbox.y1); |
2398 | 0 | return; |
2399 | 0 | } |
2400 | 0 | else if (b->bbox.y0 == bbox.y0 && b->bbox.y1 == bbox.y1 && b->bbox.x1 + fudge >= bbox.x0 && b->bbox.x0 - fudge <= bbox.x1) |
2401 | 0 | { |
2402 | | /* Stacks horizontally. */ |
2403 | 0 | b->bbox.x0 = fz_min(b->bbox.x0, bbox.x0); |
2404 | 0 | b->bbox.x1 = fz_max(b->bbox.x1, bbox.x1); |
2405 | 0 | return; |
2406 | 0 | } |
2407 | | |
2408 | | /* So, we can't add our new vector onto the previous one. But can we merge the 2 previous ones? */ |
2409 | | /* The intent here is that we allow a set of vector 'blocks' to be merged together, perhaps: |
2410 | | * ABC |
2411 | | * Then we allow another set to be merged together, perhaps DE: |
2412 | | * ABC |
2413 | | * DE |
2414 | | * Then when we get another block that can't be merged into DE (perhaps F): |
2415 | | * ABC |
2416 | | * DE |
2417 | | * F |
2418 | | * We'll consider ABC and DE for merging. Whatevever block that F ends up |
2419 | | * in later (maybe FGH): |
2420 | | * ABC |
2421 | | * DE |
2422 | | * FGH |
2423 | | * will be considered for merging later. We can always do this "exactly" (if the blocks |
2424 | | * line up precisely), but to do this 'lossily', we guard it with 'FUZZY_VECTORS'. |
2425 | | */ |
2426 | 0 | prev = b->prev; |
2427 | 0 | while (prev && prev->type == FZ_STEXT_BLOCK_VECTOR && (prev->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE)) |
2428 | 0 | { |
2429 | | /* Lossless merging. */ |
2430 | 0 | if (prev->bbox.x0 == b->bbox.x0 && prev->bbox.x1 == b->bbox.x1 && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1) |
2431 | 0 | { |
2432 | | /* Stacks exactly vertically. Very rarely hit. */ |
2433 | 0 | prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0); |
2434 | 0 | prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1); |
2435 | 0 | return; |
2436 | 0 | } |
2437 | 0 | else if (prev->bbox.y0 == b->bbox.y0 && prev->bbox.y1 == b->bbox.y1 && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1) |
2438 | 0 | { |
2439 | | /* Stacks horizontally. Very rarely hit. */ |
2440 | 0 | prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0); |
2441 | 0 | prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1); |
2442 | 0 | return; |
2443 | 0 | } |
2444 | 0 | if (tdev->flags & FZ_STEXT_FUZZY_VECTORS) |
2445 | 0 | { |
2446 | | /* Be more forgiving in how we merge vectors */ |
2447 | | /* We need to be careful not to merge together differently oriented borders for table cells. |
2448 | | * C |
2449 | | * | |
2450 | | * v |
2451 | | * +-----+-----+ |
2452 | | * A-> | | | |
2453 | | * +-----+-----+ |
2454 | | * B-> | | | |
2455 | | * +-----+-----+ |
2456 | | * |
2457 | | * It'd be fine to merge borders A and B together, because it still signifies the same |
2458 | | * edges. It would NOT be fine to merge A and C together, because we'd lose the sense |
2459 | | * of them being borders, and just have a blob that covered the cell. |
2460 | | * The fudge2 logic below should hopefully allow for this, as well as allowing us to |
2461 | | * match blocks like: |
2462 | | * ABC |
2463 | | * DE FG |
2464 | | * HIJ |
2465 | | * KL MN |
2466 | | * OPQ |
2467 | | */ |
2468 | 0 | float fudge2 = 2; |
2469 | 0 | if ((fabsf(prev->bbox.x0 - b->bbox.x0) <= fudge2 || fabsf(prev->bbox.x1 - b->bbox.x1) <= fudge2) && prev->bbox.y1 + fudge >= b->bbox.y0 && prev->bbox.y0 - fudge <= b->bbox.y1) |
2470 | 0 | { |
2471 | | /* Stacks vertically. */ |
2472 | 0 | goto join; |
2473 | 0 | } |
2474 | 0 | else if ((fabsf(prev->bbox.y0 - b->bbox.y0) <= fudge2 || fabsf(prev->bbox.y1 - b->bbox.y1) <= fudge2) && prev->bbox.x1 + fudge >= b->bbox.x0 && prev->bbox.x0 - fudge <= b->bbox.x1) |
2475 | 0 | { |
2476 | | /* Stacks horizontally. */ |
2477 | 0 | join: |
2478 | 0 | prev->bbox.x0 = fz_min(prev->bbox.x0, b->bbox.x0); |
2479 | 0 | prev->bbox.x1 = fz_max(prev->bbox.x1, b->bbox.x1); |
2480 | 0 | prev->bbox.y0 = fz_min(prev->bbox.y0, b->bbox.y0); |
2481 | 0 | prev->bbox.y1 = fz_max(prev->bbox.y1, b->bbox.y1); |
2482 | | /* Unlink b (so, fiddle with b->prev, which is not necessarily prev!) */ |
2483 | 0 | b->prev->next = NULL; |
2484 | 0 | if (tdev->flags & FZ_STEXT_LAZY_VECTORS) |
2485 | 0 | tdev->lazy_vectors_tail = b->prev; |
2486 | 0 | else if (page->last_struct) |
2487 | 0 | page->last_struct->last_block = b->prev; |
2488 | 0 | else |
2489 | 0 | page->last_block = b->prev; |
2490 | 0 | break; |
2491 | 0 | } |
2492 | 0 | } |
2493 | | /* Now, allow for looking further back. */ |
2494 | 0 | prev = prev->prev; |
2495 | 0 | } |
2496 | 0 | } |
2497 | 0 | } |
2498 | | |
2499 | 0 | if (tdev->flags & FZ_STEXT_LAZY_VECTORS) |
2500 | 0 | b = add_lazy_vector(ctx, page, tdev, id); |
2501 | 0 | else |
2502 | 0 | b = add_block_to_page(ctx, page, FZ_STEXT_BLOCK_VECTOR, id); |
2503 | |
|
2504 | 0 | b->bbox = bbox; |
2505 | 0 | b->u.v.flags = flags; |
2506 | 0 | b->u.v.argb = argb; |
2507 | 0 | } |
2508 | | |
/* State shared by the split_* path walker callbacks while one path is
 * separated into rectangle vectors and everything else. */
typedef struct
{
	/* Device the vectors are collected for; consulted for its flags and clip. */
	fz_stext_device *dev;
	/* Applied to every incoming path coordinate. */
	fz_matrix ctm;
	/* Colour handed to add_vector for every vector made from this path. */
	uint32_t argb;
	/* Base vector flags for this path; add_vectors_from_path sets
	 * FZ_STEXT_VECTOR_IS_STROKED here for strokes and 0 for fills. */
	uint32_t flags;
	/* Page the vectors are added to. */
	fz_stext_page *page;
	/* Bounds of the points seen so far in the current subpath. */
	fz_rect seg_bounds;
	/* Union of the bounds of the subpaths that were not rectangles. */
	fz_rect leftovers;
	/* Bounds of the most recent rectangle found, held back until the next
	 * one is found or the path ends. */
	fz_rect pending;
	/* Number of entries of p in use, or -1 once the current subpath is
	 * known not to be a rectangle. */
	int count;
	/* Points collected for the current subpath. */
	fz_point p[5];
	/* Id handed to add_vector. */
	int id;
	/* Handed to add_vector unchanged (half the line width for strokes, 0
	 * for fills, as set up by the callers in this file). */
	float exp;
} split_path_data;
2524 | | |
2525 | | static void |
2526 | | maybe_rect(fz_context *ctx, split_path_data *sp) |
2527 | 0 | { |
2528 | 0 | int rect = 0; |
2529 | 0 | int i; |
2530 | 0 | fz_rect leftovers; |
2531 | |
|
2532 | 0 | if (sp->count >= 3) |
2533 | 0 | { |
2534 | | /* Allow for multiple monotonic points in a horizontal or vertical line, |
2535 | | * such as seen in borders of tables where each column or row is written |
2536 | | * individually. (e.g. move 0 0 line 100 0 line 200 0 line 300 0) */ |
2537 | 0 | if (feq(sp->p[sp->count-1].x, sp->p[sp->count-2].x) && |
2538 | 0 | feq(sp->p[sp->count-1].x, sp->p[sp->count-3].x) && |
2539 | 0 | ((sp->p[sp->count-1].y <= sp->p[sp->count-2].y && sp->p[sp->count-2].y <= sp->p[sp->count-3].y) || |
2540 | 0 | (sp->p[sp->count-1].y >= sp->p[sp->count-2].y && sp->p[sp->count-2].y >= sp->p[sp->count-3].y))) |
2541 | 0 | { |
2542 | | /* y---->y---->y - Remove the central y */ |
2543 | 0 | sp->p[sp->count-2].y = sp->p[sp->count-1].y; |
2544 | 0 | sp->count--; |
2545 | 0 | } |
2546 | 0 | else if (feq(sp->p[sp->count-1].y, sp->p[sp->count-2].y) && |
2547 | 0 | feq(sp->p[sp->count-1].y, sp->p[sp->count-3].y) && |
2548 | 0 | ((sp->p[sp->count-1].x <= sp->p[sp->count-2].x && sp->p[sp->count-2].x <= sp->p[sp->count-3].x) || |
2549 | 0 | (sp->p[sp->count-1].x >= sp->p[sp->count-2].x && sp->p[sp->count-2].x >= sp->p[sp->count-3].x))) |
2550 | 0 | { |
2551 | | /* x---->x---->x - Remove the central x */ |
2552 | 0 | sp->p[sp->count-2].x = sp->p[sp->count-1].x; |
2553 | 0 | sp->count--; |
2554 | 0 | } |
2555 | 0 | } |
2556 | |
|
2557 | 0 | if (sp->count >= 0) |
2558 | 0 | { |
2559 | 0 | if (sp->count == 3) |
2560 | 0 | { |
2561 | | /* Allow for "moveto A, lineto B, lineto A, close" */ |
2562 | 0 | if (feq(sp->p[0].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[2].y)) |
2563 | 0 | sp->count = 2; |
2564 | 0 | } |
2565 | 0 | if (sp->count == 2) |
2566 | 0 | { |
2567 | 0 | if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y)) |
2568 | 0 | rect = 1; /* Count that as a rect */ |
2569 | 0 | } |
2570 | 0 | else if (sp->count == 4 || sp->count == 5) |
2571 | 0 | { |
2572 | 0 | if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y)) |
2573 | 0 | rect = 1; |
2574 | 0 | else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y)) |
2575 | 0 | rect = 1; |
2576 | 0 | } |
2577 | 0 | if (rect) |
2578 | 0 | { |
2579 | 0 | fz_rect bounds; |
2580 | |
|
2581 | 0 | bounds.x0 = bounds.x1 = sp->p[0].x; |
2582 | 0 | bounds.y0 = bounds.y1 = sp->p[0].y; |
2583 | 0 | for (i = 1; i < sp->count; i++) |
2584 | 0 | bounds = fz_include_point_in_rect(bounds, sp->p[i]); |
2585 | 0 | if (fz_is_valid_rect(sp->pending)) |
2586 | 0 | add_vector(ctx, sp->page, sp->dev, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb, sp->id, sp->exp); |
2587 | 0 | sp->pending = bounds; |
2588 | 0 | return; |
2589 | 0 | } |
2590 | 0 | } |
2591 | | |
2592 | | /* We aren't a rectangle! */ |
2593 | 0 | leftovers = sp->seg_bounds; |
2594 | |
|
2595 | 0 | if (sp->dev->flags & (FZ_STEXT_CLIP_RECT | FZ_STEXT_CLIP)) |
2596 | 0 | leftovers = fz_intersect_rect(leftovers, current_clip(ctx, sp->dev)); |
2597 | |
|
2598 | 0 | if (fz_is_valid_rect(leftovers)) |
2599 | 0 | sp->leftovers = fz_union_rect(sp->leftovers, leftovers); |
2600 | | |
2601 | | /* Remember we're not a rect. */ |
2602 | 0 | sp->count = -1; |
2603 | 0 | } |
2604 | | |
2605 | | static void |
2606 | | split_move(fz_context *ctx, void *arg, float x, float y) |
2607 | 0 | { |
2608 | 0 | split_path_data *sp = (split_path_data *)arg; |
2609 | 0 | fz_point p = fz_transform_point_xy(x, y, sp->ctm); |
2610 | |
|
2611 | 0 | maybe_rect(ctx, sp); |
2612 | 0 | sp->p[0] = p; |
2613 | 0 | sp->count = 1; |
2614 | 0 | sp->seg_bounds.x0 = sp->seg_bounds.x1 = p.x; |
2615 | 0 | sp->seg_bounds.y0 = sp->seg_bounds.y1 = p.y; |
2616 | 0 | } |
2617 | | |
2618 | | static void |
2619 | | split_line(fz_context *ctx, void *arg, float x, float y) |
2620 | 0 | { |
2621 | 0 | split_path_data *sp = (split_path_data *)arg; |
2622 | 0 | fz_point p = fz_transform_point_xy(x, y, sp->ctm); |
2623 | |
|
2624 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, p); |
2625 | |
|
2626 | 0 | if (sp->count >= 0) |
2627 | 0 | { |
2628 | | /* Check for lines to the same point. */ |
2629 | 0 | if (sp->count > 0 && feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y)) |
2630 | 0 | return; |
2631 | | /* If we're still maybe a rect, just record the point. */ |
2632 | 0 | if (sp->count < 4) |
2633 | 0 | { |
2634 | 0 | sp->p[sp->count++] = p; |
2635 | 0 | return; |
2636 | 0 | } |
2637 | | /* Check for close line? */ |
2638 | 0 | if (sp->count == 4) |
2639 | 0 | { |
2640 | 0 | if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y)) |
2641 | 0 | { |
2642 | | /* We've just drawn a line back to the start point. */ |
2643 | | /* Needless saving of point, but it makes the logic |
2644 | | * easier elsewhere. */ |
2645 | 0 | sp->p[sp->count++] = p; |
2646 | 0 | return; |
2647 | 0 | } |
2648 | 0 | } |
2649 | | /* We can no longer be a rect. */ |
2650 | 0 | sp->count = -1; |
2651 | 0 | } |
2652 | 0 | } |
2653 | | |
2654 | | static void |
2655 | | split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3) |
2656 | 0 | { |
2657 | 0 | split_path_data *sp = (split_path_data *)arg; |
2658 | |
|
2659 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x1, y1, sp->ctm)); |
2660 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x2, y2, sp->ctm)); |
2661 | 0 | sp->seg_bounds = fz_include_point_in_rect(sp->seg_bounds, fz_transform_point_xy(x3, y3, sp->ctm)); |
2662 | | |
2663 | | /* We can no longer be a rect. */ |
2664 | 0 | sp->count = -1; |
2665 | 0 | } |
2666 | | |
2667 | | static void |
2668 | | split_close(fz_context *ctx, void *arg) |
2669 | 0 | { |
2670 | 0 | split_path_data *sp = (split_path_data *)arg; |
2671 | |
|
2672 | 0 | maybe_rect(ctx, sp); |
2673 | 0 | sp->count = 0; |
2674 | 0 | } |
2675 | | |
2676 | | |
/* Walker handed to fz_walk_path by add_vectors_from_path. Only these four
 * callbacks are supplied. */
static const
fz_path_walker split_path_rects =
{
	split_move,	/* start of a subpath */
	split_line,	/* straight segment */
	split_curve,	/* curved segment */
	split_close	/* end of a closed subpath */
};
2685 | | |
2686 | | static void |
2687 | | add_vectors_from_path(fz_context *ctx, fz_stext_page *page, fz_stext_device *tdev, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, const fz_stroke_state *stroke, float exp) |
2688 | 0 | { |
2689 | 0 | int have_leftovers; |
2690 | 0 | split_path_data sp; |
2691 | 0 | int id = tdev->id; |
2692 | 0 | int trailing_moves_acceptable = (stroke == NULL || stroke->end_cap != FZ_LINECAP_ROUND); |
2693 | |
|
2694 | 0 | sp.dev = tdev; |
2695 | 0 | sp.ctm = ctm; |
2696 | 0 | sp.argb = hexrgba_from_color(ctx, cs, color, alpha); |
2697 | 0 | sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0; |
2698 | 0 | sp.page = page; |
2699 | 0 | sp.count = 0; |
2700 | 0 | sp.leftovers = fz_empty_rect; |
2701 | 0 | sp.seg_bounds = fz_empty_rect; |
2702 | 0 | sp.pending = fz_empty_rect; |
2703 | 0 | sp.id = id; |
2704 | 0 | sp.exp = exp; |
2705 | 0 | fz_walk_path(ctx, path, &split_path_rects, &sp); |
2706 | |
|
2707 | 0 | have_leftovers = fz_is_valid_rect(sp.leftovers); |
2708 | |
|
2709 | 0 | if (!trailing_moves_acceptable || sp.count != 1) |
2710 | 0 | maybe_rect(ctx, &sp); |
2711 | |
|
2712 | 0 | if ((!trailing_moves_acceptable || sp.count != 1) && fz_is_valid_rect(sp.pending)) |
2713 | 0 | add_vector(ctx, page, sp.dev, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb, id, exp); |
2714 | 0 | if (have_leftovers) |
2715 | 0 | add_vector(ctx, page, sp.dev, sp.leftovers, sp.flags, sp.argb, id, exp); |
2716 | 0 | } |
2717 | | |
2718 | | static void |
2719 | | fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp) |
2720 | 0 | { |
2721 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2722 | 0 | fz_stext_page *page = tdev->page; |
2723 | 0 | fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm); |
2724 | 0 | fz_rect *bounds = actualtext_bounds(tdev); |
2725 | | |
2726 | | /* If we're in an actualtext, then update the bounds to include this content. */ |
2727 | 0 | if (bounds != NULL) |
2728 | 0 | *bounds = fz_union_rect(*bounds, path_bounds); |
2729 | |
|
2730 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_STYLES) |
2731 | 0 | check_for_strikeout(ctx, tdev, page, path, ctm, hexrgba_from_color(ctx, cs, color, alpha)); |
2732 | |
|
2733 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_VECTORS) |
2734 | 0 | add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, NULL, 0); |
2735 | 0 | } |
2736 | | |
2737 | | static void |
2738 | | fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp) |
2739 | 0 | { |
2740 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2741 | 0 | fz_stext_page *page = tdev->page; |
2742 | 0 | fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm); |
2743 | 0 | fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev); |
2744 | 0 | float exp = ss->linewidth / 2; |
2745 | | |
2746 | | /* If we're in an actualtext, then update the bounds to include this content. */ |
2747 | 0 | if (bounds != NULL) |
2748 | 0 | *bounds = fz_union_rect(*bounds, path_bounds); |
2749 | |
|
2750 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_STYLES) |
2751 | 0 | check_for_strikeout(ctx, tdev, page, path, ctm, hexrgba_from_color(ctx, cs, color, alpha)); |
2752 | |
|
2753 | 0 | if (tdev->flags & FZ_STEXT_COLLECT_VECTORS) |
2754 | 0 | add_vectors_from_path(ctx, page, tdev, path, ctm, cs, color, alpha, cp, ss, exp); |
2755 | 0 | } |
2756 | | |
2757 | | static void |
2758 | | new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw) |
2759 | 0 | { |
2760 | 0 | fz_stext_struct *str; |
2761 | 0 | size_t z; |
2762 | |
|
2763 | 0 | if (raw == NULL) |
2764 | 0 | raw = ""; |
2765 | 0 | z = strlen(raw); |
2766 | |
|
2767 | 0 | str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1); |
2768 | 0 | str->first_block = NULL; |
2769 | 0 | str->last_block = NULL; |
2770 | 0 | str->standard = standard; |
2771 | 0 | str->parent = page->last_struct; |
2772 | 0 | str->up = block; |
2773 | 0 | memcpy(str->raw, raw, z+1); |
2774 | |
|
2775 | 0 | block->u.s.down = str; |
2776 | 0 | } |
2777 | | |
2778 | | fz_stext_block * |
2779 | | fz_new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_structure standard, const char *raw, int idx) |
2780 | 0 | { |
2781 | 0 | fz_stext_block *block; |
2782 | |
|
2783 | 0 | block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
2784 | 0 | block->bbox = fz_empty_rect; |
2785 | 0 | block->prev = NULL; |
2786 | 0 | block->next = NULL; |
2787 | 0 | block->type = FZ_STEXT_BLOCK_STRUCT; |
2788 | 0 | block->u.s.index = idx; |
2789 | 0 | block->u.s.down = NULL; |
2790 | | /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */ |
2791 | 0 | new_stext_struct(ctx, page, block, standard, raw); |
2792 | |
|
2793 | 0 | return block; |
2794 | 0 | } |
2795 | | |
2796 | | |
/*
 * Device callback for the start of a structure element.
 *
 * Looks among the struct blocks at the current level for one with index
 * 'idx'. If one exists, content collection moves down into it (creating
 * its structure node if it has none yet). Otherwise a new struct block is
 * created and linked in before the first struct block found with a higher
 * index, or at the end of the list if there is none. Either way
 * page->last_struct and page->last_block are updated so that following
 * content lands inside the structure.
 */
static void
fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
{
	fz_stext_device *tdev = (fz_stext_device*)dev;
	fz_stext_page *page = tdev->page;
	fz_stext_block *block, *le, *gt, *newblock;

	/* The name is compared with strcmp below, so it must not be NULL. */
	if (raw == NULL)
		raw = "";

	/* Find a pointer to the last block. */
	if (page->last_block)
	{
		block = page->last_block;
	}
	else if (page->last_struct)
	{
		block = page->last_struct->last_block;
	}
	else
	{
		block = page->first_block;
	}

	/* So block is somewhere in the content chain. Let's try and find:
	 *   le = the struct node <= idx before block in the content chain.
	 *   gt = the struct node > idx nearest to le that we pass on the way.
	 * Search backwards to start with.
	 */
	gt = NULL;
	le = block;
	while (le)
	{
		if (le->type == FZ_STEXT_BLOCK_STRUCT)
		{
			/* Each higher index we pass replaces the last, so gt ends up
			 * as the earliest one seen in the list. */
			if (le->u.s.index > idx)
				gt = le;
			if (le->u.s.index <= idx)
				break;
		}
		le = le->prev;
	}
	/* The following loop copes with finding gt (the smallest block with an index higher
	 * than we want) if we haven't found it already. The while loop in here was designed
	 * to cope with 'block' being in the middle of a list. In fact, the way the code is
	 * currently, block will always be at the end of a list, so the while won't do anything.
	 * But I'm loath to remove it in case we ever change this code to start from wherever
	 * we did the last insertion. */
	if (gt == NULL)
	{
		gt = block;
		while (gt)
		{
			if (gt->type == FZ_STEXT_BLOCK_STRUCT)
			{
				if (gt->u.s.index <= idx)
					le = gt;
				if (gt->u.s.index >= idx)
					break;
			}
			/* Keep block trailing one behind gt, so that if we run off the
			 * end, block is the last block in the list. */
			block = gt;
			gt = gt->next;
		}
	}

	if (le && le->u.s.index == idx)
	{
		/* We want to move down into the le block. Does it have a struct
		 * attached yet? */
		if (le->u.s.down == NULL)
		{
			/* No. We need to create a new struct node. */
			new_stext_struct(ctx, page, le, standard, raw);
		}
		else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0)
		{
			/* Yes, but it doesn't match the one we expect! */
			fz_warn(ctx, "Mismatched structure type!");
		}
		/* Carry on adding content inside the existing structure. */
		page->last_struct = le->u.s.down;
		page->last_block = le->u.s.down->last_block;

		return;
	}

	/* We are going to need to create a new block. Create a complete unlinked one here. */
	newblock = fz_new_stext_struct(ctx, page, standard, raw, idx);

	/* So now we just need to link it in somewhere. */
	if (gt)
	{
		/* Link it in before gt. */
		newblock->prev = gt->prev;
		if (gt->prev)
			gt->prev->next = newblock;
		else if (page->last_struct)
		{
			/* We're linking it in at the start under another struct! */
			assert(page->last_struct->first_block == gt);
			assert(page->last_struct->last_block != NULL);
			page->last_struct->first_block = newblock;
		}
		else
		{
			/* We're linking it in at the start of the page! */
			assert(page->first_block == gt);
			page->first_block = newblock;
		}
		gt->prev = newblock;
		newblock->next = gt;
		/* Take the id of the block we were put in front of. */
		newblock->id = gt->id;
	}
	else if (block)
	{
		/* Link it in at the end of the list (i.e. after 'block') */
		newblock->prev = block;
		block->next = newblock;
		if (page->last_struct)
		{
			assert(page->last_struct->last_block == block);
			page->last_struct->last_block = newblock;
		}
		else
		{
			assert(page->last_block == block);
			page->last_block = newblock;
		}
		/* Take the id of the block we were put after. */
		newblock->id = block->id;
	}
	else if (page->last_struct)
	{
		/* We have no blocks at all at this level. */
		page->last_struct->first_block = newblock;
		page->last_struct->last_block = newblock;
		/* Take the id of the block that owns the enclosing structure. */
		newblock->id = page->last_struct->up->id;
	}
	else
	{
		/* We have no blocks at ANY level. */
		page->first_block = newblock;
		/* newblock will have an id of 0. Best we can do. */
	}
	/* Wherever we linked it in, that's where we want to continue adding content. */
	page->last_struct = newblock->u.s.down;
	page->last_block = NULL;
}
2943 | | |
2944 | | static void |
2945 | | fz_stext_end_structure(fz_context *ctx, fz_device *dev) |
2946 | 0 | { |
2947 | 0 | fz_stext_device *tdev = (fz_stext_device*)dev; |
2948 | 0 | fz_stext_page *page = tdev->page; |
2949 | 0 | fz_stext_struct *str = page->last_struct; |
2950 | |
|
2951 | 0 | if (str == NULL) |
2952 | 0 | { |
2953 | 0 | fz_warn(ctx, "Structure out of sync"); |
2954 | 0 | return; |
2955 | 0 | } |
2956 | | |
2957 | 0 | page->last_struct = str->parent; |
2958 | 0 | if (page->last_struct == NULL) |
2959 | 0 | { |
2960 | 0 | page->last_block = page->first_block; |
2961 | | /* Yuck */ |
2962 | 0 | while (page->last_block->next) |
2963 | 0 | page->last_block = page->last_block->next; |
2964 | 0 | } |
2965 | 0 | else |
2966 | 0 | { |
2967 | 0 | page->last_block = page->last_struct->last_block; |
2968 | 0 | } |
2969 | 0 | } |
2970 | | |
2971 | | fz_device * |
2972 | | fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) |
2973 | 0 | { |
2974 | 0 | return fz_new_stext_device_for_page(ctx, page, opts, 0, 0, fz_empty_rect); |
2975 | 0 | } |
2976 | | |
2977 | | fz_device * |
2978 | | fz_new_stext_device_for_page(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts, int chapter_num, int page_num, fz_rect mediabox) |
2979 | 0 | { |
2980 | 0 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); |
2981 | |
|
2982 | 0 | dev->super.close_device = fz_stext_close_device; |
2983 | 0 | dev->super.drop_device = fz_stext_drop_device; |
2984 | |
|
2985 | 0 | dev->super.fill_text = fz_stext_fill_text; |
2986 | 0 | dev->super.stroke_text = fz_stext_stroke_text; |
2987 | 0 | dev->super.clip_text = fz_stext_clip_text; |
2988 | 0 | dev->super.clip_stroke_text = fz_stext_clip_stroke_text; |
2989 | 0 | dev->super.ignore_text = fz_stext_ignore_text; |
2990 | 0 | dev->super.begin_metatext = fz_stext_begin_metatext; |
2991 | 0 | dev->super.end_metatext = fz_stext_end_metatext; |
2992 | |
|
2993 | 0 | dev->super.fill_shade = fz_stext_fill_shade; |
2994 | 0 | dev->super.fill_image = fz_stext_fill_image; |
2995 | 0 | dev->super.fill_image_mask = fz_stext_fill_image_mask; |
2996 | |
|
2997 | 0 | if (opts) |
2998 | 0 | { |
2999 | 0 | dev->flags = opts->flags; |
3000 | 0 | if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE) |
3001 | 0 | { |
3002 | 0 | dev->super.begin_structure = fz_stext_begin_structure; |
3003 | 0 | dev->super.end_structure = fz_stext_end_structure; |
3004 | 0 | } |
3005 | 0 | if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES)) |
3006 | 0 | { |
3007 | 0 | dev->super.fill_path = fz_stext_fill_path; |
3008 | 0 | dev->super.stroke_path = fz_stext_stroke_path; |
3009 | 0 | } |
3010 | 0 | } |
3011 | 0 | dev->page = page; |
3012 | 0 | dev->pen.x = 0; |
3013 | 0 | dev->pen.y = 0; |
3014 | 0 | dev->trm = fz_identity; |
3015 | 0 | dev->lastchar = ' '; |
3016 | 0 | dev->lastline = NULL; |
3017 | 0 | dev->lasttext = NULL; |
3018 | 0 | dev->lastbidi = 0; |
3019 | 0 | dev->last_was_fake_bold = 1; |
3020 | 0 | if (opts) |
3021 | 0 | dev->opts = *opts; |
3022 | | |
3023 | | /* If we are ignoring images, then it'd be nice to skip the decode costs. BUT we still need them to tell |
3024 | | * us the bounds for ActualText, so we can only actually skip them if we are ignoring actualtext too. */ |
3025 | 0 | if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && (dev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) != 0) |
3026 | 0 | dev->super.hints |= FZ_DONT_DECODE_IMAGES; |
3027 | |
|
3028 | 0 | dev->rect_max = 0; |
3029 | 0 | dev->rect_len = 0; |
3030 | 0 | dev->rects = NULL; |
3031 | | |
3032 | | /* Push a new id */ |
3033 | 0 | fz_try(ctx) |
3034 | 0 | { |
3035 | 0 | fz_stext_page_details *deets; |
3036 | 0 | size_t id; |
3037 | 0 | deets = fz_pool_array_append(ctx, page->id_list, &id); |
3038 | 0 | dev->id = (int)id; |
3039 | 0 | deets->mediabox = mediabox; |
3040 | 0 | deets->chapter = chapter_num; |
3041 | 0 | deets->page = page_num; |
3042 | 0 | } |
3043 | 0 | fz_catch(ctx) |
3044 | 0 | { |
3045 | 0 | fz_free(ctx, dev); |
3046 | 0 | fz_rethrow(ctx); |
3047 | 0 | } |
3048 | | |
3049 | 0 | page->mediabox = fz_union_rect(page->mediabox, mediabox); |
3050 | |
|
3051 | 0 | return (fz_device*)dev; |
3052 | 0 | } |