/src/mupdf/source/html/html-outline.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2024 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "html-imp.h" |
25 | | |
26 | | #include <string.h> |
27 | | |
/*
	Indices into four-sided arrays such as page_margin.
	T and L are used below as the top and left margins; R and B are
	presumably right and bottom.
*/
enum
{
	T,
	R,
	B,
	L
};
29 | | |
/*
	Classify a link target as internal (relative to the document) or
	external.

	A URI is external when it begins with an RFC 3986 scheme (a letter
	followed by any number of letters, digits, '+', '-' or '.') that is
	immediately followed by "://". Everything else -- fragments such as
	"#id", relative paths, and scheme-like prefixes that are not
	followed by "://" -- is reported as internal.

	Returns 0 for external URIs and 1 for internal ones.
*/
static int is_internal_uri(const char *uri)
{
	const char *p = uri;

	/* A scheme has to start with a letter. */
	if (!((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z')))
		return 1;
	++p;

	/* Skip the rest of the scheme; schemes are case-insensitive. */
	while ((*p >= 'a' && *p <= 'z') ||
		(*p >= 'A' && *p <= 'Z') ||
		(*p >= '0' && *p <= '9') ||
		*p == '+' || *p == '-' || *p == '.')
		++p;

	if (p[0] == ':' && p[1] == '/' && p[2] == '/')
		return 0;
	return 1;
}
38 | | |
39 | | static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file) |
40 | 0 | { |
41 | 0 | fz_link *link; |
42 | 0 | fz_html_flow *next; |
43 | 0 | char path[2048]; |
44 | 0 | fz_rect bbox; |
45 | 0 | const char *dest; |
46 | 0 | const char *href; |
47 | 0 | float end; |
48 | |
|
49 | 0 | float page_y0 = page * page_h; |
50 | 0 | float page_y1 = (page + 1) * page_h; |
51 | |
|
52 | 0 | while (flow) |
53 | 0 | { |
54 | 0 | next = flow->next; |
55 | 0 | if (flow->y >= page_y0 && flow->y <= page_y1) |
56 | 0 | { |
57 | 0 | href = flow->box->href; |
58 | 0 | if (href) |
59 | 0 | { |
60 | | /* Coalesce contiguous flow boxes into one link node */ |
61 | 0 | end = flow->x + flow->w; |
62 | 0 | while (next && |
63 | 0 | next->y == flow->y && |
64 | 0 | next->h == flow->h && |
65 | 0 | next->box->href == href) |
66 | 0 | { |
67 | 0 | end = next->x + next->w; |
68 | 0 | next = next->next; |
69 | 0 | } |
70 | |
|
71 | 0 | bbox.x0 = flow->x; |
72 | 0 | bbox.y0 = flow->y - page * page_h; |
73 | 0 | bbox.x1 = end; |
74 | 0 | bbox.y1 = bbox.y0 + flow->h; |
75 | 0 | if (flow->type != FLOW_IMAGE) |
76 | 0 | { |
77 | | /* flow->y is the baseline, adjust bbox appropriately */ |
78 | 0 | bbox.y0 -= 0.8f * flow->h; |
79 | 0 | bbox.y1 -= 0.8f * flow->h; |
80 | 0 | } |
81 | |
|
82 | 0 | if (is_internal_uri(href)) |
83 | 0 | { |
84 | 0 | if (href[0] == '#') |
85 | 0 | { |
86 | 0 | fz_strlcpy(path, file, sizeof path); |
87 | 0 | fz_strlcat(path, href, sizeof path); |
88 | 0 | } |
89 | 0 | else |
90 | 0 | { |
91 | 0 | fz_strlcpy(path, dir, sizeof path); |
92 | 0 | fz_strlcat(path, "/", sizeof path); |
93 | 0 | fz_strlcat(path, href, sizeof path); |
94 | 0 | } |
95 | 0 | fz_urldecode(path); |
96 | 0 | fz_cleanname(path); |
97 | |
|
98 | 0 | dest = path; |
99 | 0 | } |
100 | 0 | else |
101 | 0 | { |
102 | 0 | dest = href; |
103 | 0 | } |
104 | |
|
105 | 0 | link = fz_new_derived_link(ctx, fz_link, bbox, dest); |
106 | 0 | link->next = head; |
107 | 0 | head = link; |
108 | 0 | } |
109 | 0 | } |
110 | 0 | flow = next; |
111 | 0 | } |
112 | 0 | return head; |
113 | 0 | } |
114 | | |
115 | | static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file) |
116 | 0 | { |
117 | 0 | while (box) |
118 | 0 | { |
119 | 0 | if (box->type == BOX_FLOW) |
120 | 0 | head = load_link_flow(ctx, box->u.flow.head, head, page, page_h, dir, file); |
121 | 0 | if (box->down) |
122 | 0 | head = load_link_box(ctx, box->down, head, page, page_h, dir, file); |
123 | 0 | box = box->next; |
124 | 0 | } |
125 | 0 | return head; |
126 | 0 | } |
127 | | |
128 | | fz_link * |
129 | | fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file) |
130 | 0 | { |
131 | 0 | fz_link *link, *head; |
132 | 0 | char dir[2048]; |
133 | 0 | fz_dirname(dir, file, sizeof dir); |
134 | |
|
135 | 0 | head = load_link_box(ctx, html->tree.root, NULL, page, html->page_h, dir, file); |
136 | |
|
137 | 0 | for (link = head; link; link = link->next) |
138 | 0 | { |
139 | | /* Adjust for page margins */ |
140 | 0 | link->rect.x0 += html->page_margin[L]; |
141 | 0 | link->rect.x1 += html->page_margin[L]; |
142 | 0 | link->rect.y0 += html->page_margin[T]; |
143 | 0 | link->rect.y1 += html->page_margin[T]; |
144 | 0 | } |
145 | |
|
146 | 0 | return head; |
147 | 0 | } |
148 | | |
149 | | static fz_html_flow * |
150 | | find_first_content(fz_html_box *box) |
151 | 0 | { |
152 | 0 | while (box) |
153 | 0 | { |
154 | 0 | if (box->type == BOX_FLOW) |
155 | 0 | return box->u.flow.head; |
156 | 0 | box = box->down; |
157 | 0 | } |
158 | 0 | return NULL; |
159 | 0 | } |
160 | | |
161 | | static float |
162 | | find_flow_target(fz_html_flow *flow, const char *id) |
163 | 0 | { |
164 | 0 | while (flow) |
165 | 0 | { |
166 | 0 | if (flow->box->id && !strcmp(id, flow->box->id)) |
167 | 0 | return flow->y; |
168 | 0 | flow = flow->next; |
169 | 0 | } |
170 | 0 | return -1; |
171 | 0 | } |
172 | | |
173 | | static float |
174 | | find_box_target(fz_html_box *box, const char *id) |
175 | 0 | { |
176 | 0 | float y; |
177 | 0 | while (box) |
178 | 0 | { |
179 | 0 | if (box->id && !strcmp(id, box->id)) |
180 | 0 | { |
181 | 0 | fz_html_flow *flow = find_first_content(box); |
182 | 0 | if (flow) |
183 | 0 | return flow->y; |
184 | 0 | return box->s.layout.y; |
185 | 0 | } |
186 | 0 | if (box->type == BOX_FLOW) |
187 | 0 | { |
188 | 0 | y = find_flow_target(box->u.flow.head, id); |
189 | 0 | if (y >= 0) |
190 | 0 | return y; |
191 | 0 | } |
192 | 0 | else |
193 | 0 | { |
194 | 0 | y = find_box_target(box->down, id); |
195 | 0 | if (y >= 0) |
196 | 0 | return y; |
197 | 0 | } |
198 | 0 | box = box->next; |
199 | 0 | } |
200 | 0 | return -1; |
201 | 0 | } |
202 | | |
203 | | float |
204 | | fz_find_html_target(fz_context *ctx, fz_html *html, const char *id) |
205 | 0 | { |
206 | 0 | return find_box_target(html->tree.root, id); |
207 | 0 | } |
208 | | |
209 | | static fz_html_flow * |
210 | | make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y, fz_html_flow **candidate) |
211 | 0 | { |
212 | 0 | while (flow) |
213 | 0 | { |
214 | 0 | *candidate = flow; |
215 | 0 | if (flow->y >= y) |
216 | 0 | return flow; |
217 | 0 | flow = flow->next; |
218 | 0 | } |
219 | 0 | return NULL; |
220 | 0 | } |
221 | | |
222 | | static fz_html_flow * |
223 | | make_box_bookmark(fz_context *ctx, fz_html_box *box, float y, fz_html_flow **candidate) |
224 | 0 | { |
225 | 0 | fz_html_flow *mark; |
226 | 0 | fz_html_flow *dummy = NULL; |
227 | 0 | if (candidate == NULL) |
228 | 0 | candidate = &dummy; |
229 | 0 | while (box) |
230 | 0 | { |
231 | 0 | if (box->type == BOX_FLOW) |
232 | 0 | { |
233 | 0 | if (box->s.layout.y >= y) |
234 | 0 | { |
235 | 0 | mark = make_flow_bookmark(ctx, box->u.flow.head, y, candidate); |
236 | 0 | if (mark) |
237 | 0 | return mark; |
238 | 0 | } |
239 | 0 | else |
240 | 0 | *candidate = make_flow_bookmark(ctx, box->u.flow.head, y, candidate); |
241 | 0 | } |
242 | 0 | else |
243 | 0 | { |
244 | 0 | mark = make_box_bookmark(ctx, box->down, y, candidate); |
245 | 0 | if (mark) |
246 | 0 | return mark; |
247 | 0 | } |
248 | 0 | box = box->next; |
249 | 0 | } |
250 | 0 | return *candidate; |
251 | 0 | } |
252 | | |
253 | | fz_bookmark |
254 | | fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page) |
255 | 0 | { |
256 | 0 | return (fz_bookmark)make_box_bookmark(ctx, html->tree.root, page * html->page_h, NULL); |
257 | 0 | } |
258 | | |
259 | | static int |
260 | | lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark) |
261 | 0 | { |
262 | 0 | while (flow) |
263 | 0 | { |
264 | 0 | if (flow == mark) |
265 | 0 | return 1; |
266 | 0 | flow = flow->next; |
267 | 0 | } |
268 | 0 | return 0; |
269 | 0 | } |
270 | | |
271 | | static int |
272 | | lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark) |
273 | 0 | { |
274 | 0 | while (box) |
275 | 0 | { |
276 | 0 | if (box->type == BOX_FLOW) |
277 | 0 | { |
278 | 0 | if (lookup_flow_bookmark(ctx, box->u.flow.head, mark)) |
279 | 0 | return 1; |
280 | 0 | } |
281 | 0 | else |
282 | 0 | { |
283 | 0 | if (lookup_box_bookmark(ctx, box->down, mark)) |
284 | 0 | return 1; |
285 | 0 | } |
286 | 0 | box = box->next; |
287 | 0 | } |
288 | 0 | return 0; |
289 | 0 | } |
290 | | |
291 | | int |
292 | | fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark) |
293 | 0 | { |
294 | 0 | fz_html_flow *flow = (fz_html_flow*)mark; |
295 | 0 | if (flow && lookup_box_bookmark(ctx, html->tree.root, flow)) |
296 | 0 | return (int)(flow->y / html->page_h); |
297 | 0 | return -1; |
298 | 0 | } |
299 | | |
300 | | struct outline_parser |
301 | | { |
302 | | fz_html *html; |
303 | | fz_buffer *cat; |
304 | | fz_outline *head; |
305 | | fz_outline **tail[6]; |
306 | | fz_outline **down[6]; |
307 | | int level[6]; |
308 | | int current; |
309 | | int id; |
310 | | }; |
311 | | |
312 | | static void |
313 | | cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow) |
314 | 0 | { |
315 | 0 | while (flow) |
316 | 0 | { |
317 | 0 | switch (flow->type) |
318 | 0 | { |
319 | 0 | case FLOW_WORD: |
320 | 0 | fz_append_string(ctx, cat, flow->content.text); |
321 | 0 | break; |
322 | 0 | case FLOW_SPACE: |
323 | 0 | case FLOW_BREAK: |
324 | 0 | fz_append_byte(ctx, cat, ' '); |
325 | 0 | break; |
326 | 0 | default: |
327 | 0 | break; |
328 | 0 | } |
329 | 0 | flow = flow->next; |
330 | 0 | } |
331 | 0 | } |
332 | | |
333 | | static void |
334 | | cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box) |
335 | 0 | { |
336 | 0 | while (box) |
337 | 0 | { |
338 | 0 | if (box->type == BOX_FLOW) |
339 | 0 | cat_html_flow(ctx, cat, box->u.flow.head); |
340 | 0 | cat_html_box(ctx, cat, box->down); |
341 | 0 | box = box->next; |
342 | 0 | } |
343 | 0 | } |
344 | | |
345 | | static const char * |
346 | | cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box) |
347 | 0 | { |
348 | 0 | if (!x->cat) |
349 | 0 | x->cat = fz_new_buffer(ctx, 1024); |
350 | 0 | else |
351 | 0 | fz_clear_buffer(ctx, x->cat); |
352 | |
|
353 | 0 | cat_html_flow(ctx, x->cat, box->u.flow.head); |
354 | 0 | cat_html_box(ctx, x->cat, box->down); |
355 | |
|
356 | 0 | return fz_string_from_buffer(ctx, x->cat); |
357 | 0 | } |
358 | | |
359 | | static void |
360 | | add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) |
361 | 0 | { |
362 | 0 | fz_outline *node; |
363 | 0 | char buf[100]; |
364 | 0 | int heading; |
365 | |
|
366 | 0 | node = fz_new_outline(ctx); |
367 | 0 | fz_try(ctx) |
368 | 0 | { |
369 | 0 | node->title = Memento_label(fz_strdup(ctx, cat_html_text(ctx, x, box)), "outline_title"); |
370 | 0 | if (!box->id) |
371 | 0 | { |
372 | 0 | fz_snprintf(buf, sizeof buf, "'%d", x->id++); |
373 | 0 | box->id = Memento_label(fz_pool_strdup(ctx, x->html->tree.pool, buf), "box_id"); |
374 | 0 | } |
375 | 0 | node->uri = Memento_label(fz_asprintf(ctx, "#%s", box->id), "outline_uri"); |
376 | 0 | node->is_open = 1; |
377 | 0 | } |
378 | 0 | fz_catch(ctx) |
379 | 0 | { |
380 | 0 | fz_free(ctx, node); |
381 | 0 | fz_rethrow(ctx); |
382 | 0 | } |
383 | | |
384 | 0 | heading = box->heading; |
385 | 0 | if (x->level[x->current] < heading && x->current < 5) |
386 | 0 | { |
387 | 0 | x->tail[x->current+1] = x->down[x->current]; |
388 | 0 | x->current += 1; |
389 | 0 | } |
390 | 0 | else |
391 | 0 | { |
392 | 0 | while (x->current > 0 && x->level[x->current] > heading) |
393 | 0 | { |
394 | 0 | x->current -= 1; |
395 | 0 | } |
396 | 0 | } |
397 | 0 | x->level[x->current] = heading; |
398 | |
|
399 | 0 | *(x->tail[x->current]) = node; |
400 | 0 | x->tail[x->current] = &node->next; |
401 | 0 | x->down[x->current] = &node->down; |
402 | 0 | } |
403 | | |
404 | | static void |
405 | | load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) |
406 | 0 | { |
407 | 0 | while (box) |
408 | 0 | { |
409 | 0 | int heading = box->heading; |
410 | 0 | if (heading) |
411 | 0 | add_html_outline(ctx, x, box); |
412 | 0 | if (box->down) |
413 | 0 | load_html_outline(ctx, x, box->down); |
414 | 0 | box = box->next; |
415 | 0 | } |
416 | 0 | } |
417 | | |
418 | | fz_outline * |
419 | | fz_load_html_outline(fz_context *ctx, fz_html *html) |
420 | 0 | { |
421 | 0 | struct outline_parser state; |
422 | 0 | state.html = html; |
423 | 0 | state.cat = NULL; |
424 | 0 | state.head = NULL; |
425 | 0 | state.tail[0] = &state.head; |
426 | 0 | state.down[0] = NULL; |
427 | 0 | state.level[0] = 99; |
428 | 0 | state.current = 0; |
429 | 0 | state.id = 1; |
430 | 0 | fz_try(ctx) |
431 | 0 | load_html_outline(ctx, &state, html->tree.root); |
432 | 0 | fz_always(ctx) |
433 | 0 | fz_drop_buffer(ctx, state.cat); |
434 | 0 | fz_catch(ctx) |
435 | 0 | { |
436 | 0 | fz_drop_outline(ctx, state.head); |
437 | 0 | state.head = NULL; |
438 | 0 | } |
439 | 0 | return state.head; |
440 | 0 | } |