Coverage Report

Created: 2025-01-28 06:17

/src/mupdf/source/html/html-outline.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2024 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "html-imp.h"
25
26
#include <string.h>
27
28
/*
 * Indices into a four-entry per-side array. In this file T and L index
 * html->page_margin to shift link rectangles in y and x respectively.
 * NOTE(review): R and B are presumably the right and bottom sides; they are
 * not used in the code visible here -- confirm against the margin layout.
 */
enum { T, R, B, L };
29
30
/*
 * Decide whether a link target refers to something inside the document.
 *
 * A URI counts as external only when it starts with "<scheme>://". The
 * scheme is matched following RFC 3986 section 3.1: a letter followed by any
 * mix of letters, digits, '+', '-' and '.', in either case. This means that
 * "HTTP://host" and "svn+ssh://host" are recognised as external instead of
 * being mistaken for relative paths. A bare "://" prefix (empty scheme) is
 * still treated as external, as it always was.
 *
 * Returns 0 for an external URI and 1 for anything else.
 */
static int is_internal_uri(const char *uri)
{
  const char *p = uri;

  /* A scheme must begin with a letter; only then consume the rest of it. */
  if ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z'))
  {
    ++p;
    while ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') ||
      (*p >= '0' && *p <= '9') || *p == '+' || *p == '-' || *p == '.')
      ++p;
  }

  /* The short-circuit evaluation never reads past the terminating NUL. */
  if (p[0] == ':' && p[1] == '/' && p[2] == '/')
    return 0;
  return 1;
}
38
39
/*
 * Scan a chain of flow nodes and prepend one link to `head` for every run of
 * nodes on the requested page that carries an href.
 *
 * ctx:    context passed on to the link allocator.
 * flow:   first flow node to inspect (may be NULL).
 * head:   existing link list; new links are pushed on the front.
 * page:   page index; a node is considered when its y lies in
 *         [page * page_h, (page + 1) * page_h], both ends inclusive.
 * page_h: page height used to compute that range.
 * dir:    prefix joined with '/' to internal hrefs that do not start with '#'.
 * file:   prefix joined directly to internal hrefs that start with '#'.
 *
 * Returns the new head of the link list.
 */
static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file)
{
  char target[2048];
  float top = page * page_h;
  float bottom = (page + 1) * page_h;
  fz_html_flow *node;
  fz_html_flow *after;

  for (node = flow; node; node = after)
  {
    const char *href;
    const char *uri;
    fz_link *created;
    fz_rect area;
    float right;

    after = node->next;

    /* Skip anything that is not positioned on this page. */
    if (!(node->y >= top && node->y <= bottom))
      continue;

    href = node->box->href;
    if (!href)
      continue;

    /*
     * Merge the following nodes into this link while they share the same
     * y, the same height and the very same href pointer. `after` ends up
     * pointing past the merged run so those nodes are not visited again.
     */
    right = node->x + node->w;
    while (after &&
      after->y == node->y &&
      after->h == node->h &&
      after->box->href == href)
    {
      right = after->x + after->w;
      after = after->next;
    }

    /* Rectangle relative to the top of the page. */
    area.x0 = node->x;
    area.x1 = right;
    area.y0 = node->y - page * page_h;
    area.y1 = area.y0 + node->h;
    if (node->type != FLOW_IMAGE)
    {
      /* node->y is the baseline here, so move the box up to cover the text. */
      area.y0 -= 0.8f * node->h;
      area.y1 -= 0.8f * node->h;
    }

    if (!is_internal_uri(href))
    {
      /* External targets are used verbatim. */
      uri = href;
    }
    else
    {
      if (href[0] == '#')
      {
        /* Fragment only: attach it to the current file name. */
        fz_strlcpy(target, file, sizeof target);
        fz_strlcat(target, href, sizeof target);
      }
      else
      {
        /* Anything else is joined onto the given directory. */
        fz_strlcpy(target, dir, sizeof target);
        fz_strlcat(target, "/", sizeof target);
        fz_strlcat(target, href, sizeof target);
      }
      fz_urldecode(target);
      fz_cleanname(target);
      uri = target;
    }

    created = fz_new_derived_link(ctx, fz_link, area, uri);
    created->next = head;
    head = created;
  }

  return head;
}
114
115
/*
 * Collect links from a box and all of its siblings and descendants.
 *
 * Each BOX_FLOW box contributes the links found in its flow chain; every
 * box with children is descended into recursively. New links are pushed
 * onto `head`, and the resulting list head is returned.
 */
static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file)
{
  fz_html_box *cur;

  for (cur = box; cur != NULL; cur = cur->next)
  {
    if (cur->type == BOX_FLOW)
      head = load_link_flow(ctx, cur->u.flow.head, head, page, page_h, dir, file);
    if (cur->down != NULL)
      head = load_link_box(ctx, cur->down, head, page, page_h, dir, file);
  }

  return head;
}
127
128
fz_link *
129
fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file)
130
0
{
131
0
  fz_link *link, *head;
132
0
  char dir[2048];
133
0
  fz_dirname(dir, file, sizeof dir);
134
135
0
  head = load_link_box(ctx, html->tree.root, NULL, page, html->page_h, dir, file);
136
137
0
  for (link = head; link; link = link->next)
138
0
  {
139
    /* Adjust for page margins */
140
0
    link->rect.x0 += html->page_margin[L];
141
0
    link->rect.x1 += html->page_margin[L];
142
0
    link->rect.y0 += html->page_margin[T];
143
0
    link->rect.y1 += html->page_margin[T];
144
0
  }
145
146
0
  return head;
147
0
}
148
149
static fz_html_flow *
150
find_first_content(fz_html_box *box)
151
0
{
152
0
  while (box)
153
0
  {
154
0
    if (box->type == BOX_FLOW)
155
0
      return box->u.flow.head;
156
0
    box = box->down;
157
0
  }
158
0
  return NULL;
159
0
}
160
161
static float
162
find_flow_target(fz_html_flow *flow, const char *id)
163
0
{
164
0
  while (flow)
165
0
  {
166
0
    if (flow->box->id && !strcmp(id, flow->box->id))
167
0
      return flow->y;
168
0
    flow = flow->next;
169
0
  }
170
0
  return -1;
171
0
}
172
173
static float
174
find_box_target(fz_html_box *box, const char *id)
175
0
{
176
0
  float y;
177
0
  while (box)
178
0
  {
179
0
    if (box->id && !strcmp(id, box->id))
180
0
    {
181
0
      fz_html_flow *flow = find_first_content(box);
182
0
      if (flow)
183
0
        return flow->y;
184
0
      return box->s.layout.y;
185
0
    }
186
0
    if (box->type == BOX_FLOW)
187
0
    {
188
0
      y = find_flow_target(box->u.flow.head, id);
189
0
      if (y >= 0)
190
0
        return y;
191
0
    }
192
0
    else
193
0
    {
194
0
      y = find_box_target(box->down, id);
195
0
      if (y >= 0)
196
0
        return y;
197
0
    }
198
0
    box = box->next;
199
0
  }
200
0
  return -1;
201
0
}
202
203
float
204
fz_find_html_target(fz_context *ctx, fz_html *html, const char *id)
205
0
{
206
0
  return find_box_target(html->tree.root, id);
207
0
}
208
209
static fz_html_flow *
210
make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y, fz_html_flow **candidate)
211
0
{
212
0
  while (flow)
213
0
  {
214
0
    *candidate = flow;
215
0
    if (flow->y >= y)
216
0
      return flow;
217
0
    flow = flow->next;
218
0
  }
219
0
  return NULL;
220
0
}
221
222
static fz_html_flow *
223
make_box_bookmark(fz_context *ctx, fz_html_box *box, float y, fz_html_flow **candidate)
224
0
{
225
0
  fz_html_flow *mark;
226
0
  fz_html_flow *dummy = NULL;
227
0
  if (candidate == NULL)
228
0
    candidate = &dummy;
229
0
  while (box)
230
0
  {
231
0
    if (box->type == BOX_FLOW)
232
0
    {
233
0
      if (box->s.layout.y >= y)
234
0
      {
235
0
        mark = make_flow_bookmark(ctx, box->u.flow.head, y, candidate);
236
0
        if (mark)
237
0
          return mark;
238
0
      }
239
0
      else
240
0
        *candidate = make_flow_bookmark(ctx, box->u.flow.head, y, candidate);
241
0
    }
242
0
    else
243
0
    {
244
0
      mark = make_box_bookmark(ctx, box->down, y, candidate);
245
0
      if (mark)
246
0
        return mark;
247
0
    }
248
0
    box = box->next;
249
0
  }
250
0
  return *candidate;
251
0
}
252
253
fz_bookmark
254
fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page)
255
0
{
256
0
  return (fz_bookmark)make_box_bookmark(ctx, html->tree.root, page * html->page_h, NULL);
257
0
}
258
259
static int
260
lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark)
261
0
{
262
0
  while (flow)
263
0
  {
264
0
    if (flow == mark)
265
0
      return 1;
266
0
    flow = flow->next;
267
0
  }
268
0
  return 0;
269
0
}
270
271
static int
272
lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark)
273
0
{
274
0
  while (box)
275
0
  {
276
0
    if (box->type == BOX_FLOW)
277
0
    {
278
0
      if (lookup_flow_bookmark(ctx, box->u.flow.head, mark))
279
0
        return 1;
280
0
    }
281
0
    else
282
0
    {
283
0
      if (lookup_box_bookmark(ctx, box->down, mark))
284
0
        return 1;
285
0
    }
286
0
    box = box->next;
287
0
  }
288
0
  return 0;
289
0
}
290
291
int
292
fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark)
293
0
{
294
0
  fz_html_flow *flow = (fz_html_flow*)mark;
295
0
  if (flow && lookup_box_bookmark(ctx, html->tree.root, flow))
296
0
    return (int)(flow->y / html->page_h);
297
0
  return -1;
298
0
}
299
300
struct outline_parser
301
{
302
  fz_html *html;
303
  fz_buffer *cat;
304
  fz_outline *head;
305
  fz_outline **tail[6];
306
  fz_outline **down[6];
307
  int level[6];
308
  int current;
309
  int id;
310
};
311
312
static void
313
cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow)
314
0
{
315
0
  while (flow)
316
0
  {
317
0
    switch (flow->type)
318
0
    {
319
0
    case FLOW_WORD:
320
0
      fz_append_string(ctx, cat, flow->content.text);
321
0
      break;
322
0
    case FLOW_SPACE:
323
0
    case FLOW_BREAK:
324
0
      fz_append_byte(ctx, cat, ' ');
325
0
      break;
326
0
    default:
327
0
      break;
328
0
    }
329
0
    flow = flow->next;
330
0
  }
331
0
}
332
333
static void
334
cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box)
335
0
{
336
0
  while (box)
337
0
  {
338
0
    if (box->type == BOX_FLOW)
339
0
      cat_html_flow(ctx, cat, box->u.flow.head);
340
0
    cat_html_box(ctx, cat, box->down);
341
0
    box = box->next;
342
0
  }
343
0
}
344
345
static const char *
346
cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
347
0
{
348
0
  if (!x->cat)
349
0
    x->cat = fz_new_buffer(ctx, 1024);
350
0
  else
351
0
    fz_clear_buffer(ctx, x->cat);
352
353
0
  cat_html_flow(ctx, x->cat, box->u.flow.head);
354
0
  cat_html_box(ctx, x->cat, box->down);
355
356
0
  return fz_string_from_buffer(ctx, x->cat);
357
0
}
358
359
static void
360
add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
361
0
{
362
0
  fz_outline *node;
363
0
  char buf[100];
364
0
  int heading;
365
366
0
  node = fz_new_outline(ctx);
367
0
  fz_try(ctx)
368
0
  {
369
0
    node->title = Memento_label(fz_strdup(ctx, cat_html_text(ctx, x, box)), "outline_title");
370
0
    if (!box->id)
371
0
    {
372
0
      fz_snprintf(buf, sizeof buf, "'%d", x->id++);
373
0
      box->id = Memento_label(fz_pool_strdup(ctx, x->html->tree.pool, buf), "box_id");
374
0
    }
375
0
    node->uri = Memento_label(fz_asprintf(ctx, "#%s", box->id), "outline_uri");
376
0
    node->is_open = 1;
377
0
  }
378
0
  fz_catch(ctx)
379
0
  {
380
0
    fz_free(ctx, node);
381
0
    fz_rethrow(ctx);
382
0
  }
383
384
0
  heading = box->heading;
385
0
  if (x->level[x->current] < heading && x->current < 5)
386
0
  {
387
0
    x->tail[x->current+1] = x->down[x->current];
388
0
    x->current += 1;
389
0
  }
390
0
  else
391
0
  {
392
0
    while (x->current > 0 && x->level[x->current] > heading)
393
0
    {
394
0
      x->current -= 1;
395
0
    }
396
0
  }
397
0
  x->level[x->current] = heading;
398
399
0
  *(x->tail[x->current]) = node;
400
0
  x->tail[x->current] = &node->next;
401
0
  x->down[x->current] = &node->down;
402
0
}
403
404
static void
405
load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
406
0
{
407
0
  while (box)
408
0
  {
409
0
    int heading = box->heading;
410
0
    if (heading)
411
0
      add_html_outline(ctx, x, box);
412
0
    if (box->down)
413
0
      load_html_outline(ctx, x, box->down);
414
0
    box = box->next;
415
0
  }
416
0
}
417
418
fz_outline *
419
fz_load_html_outline(fz_context *ctx, fz_html *html)
420
0
{
421
0
  struct outline_parser state;
422
0
  state.html = html;
423
0
  state.cat = NULL;
424
0
  state.head = NULL;
425
0
  state.tail[0] = &state.head;
426
0
  state.down[0] = NULL;
427
0
  state.level[0] = 99;
428
0
  state.current = 0;
429
0
  state.id = 1;
430
0
  fz_try(ctx)
431
0
    load_html_outline(ctx, &state, html->tree.root);
432
0
  fz_always(ctx)
433
0
    fz_drop_buffer(ctx, state.cat);
434
0
  fz_catch(ctx)
435
0
  {
436
0
    fz_drop_outline(ctx, state.head);
437
0
    state.head = NULL;
438
0
  }
439
0
  return state.head;
440
0
}