Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/html/html-parse.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2023 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/ucdn.h"
25
#include "html-imp.h"
26
27
#include <string.h>
28
#include <stdio.h>
29
#include <assert.h>
30
31
enum { T, R, B, L };
32
33
53
#define DEFAULT_DIR FZ_BIDI_LTR
34
35
static const char *html_default_css =
36
"@page{margin:3em 2em}"
37
"a{color:#06C;text-decoration:underline}"
38
"address{display:block;font-style:italic}"
39
"b{font-weight:bold}"
40
"bdo{direction:rtl;unicode-bidi:bidi-override}"
41
"blockquote{display:block;margin:1em 40px}"
42
"body{display:block;margin:1em}"
43
"cite{font-style:italic}"
44
"code{font-family:monospace}"
45
"dd{display:block;margin:0 0 0 40px}"
46
"del{text-decoration:line-through}"
47
"div{display:block}"
48
"dl{display:block;margin:1em 0}"
49
"dt{display:block}"
50
"em{font-style:italic}"
51
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
52
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
53
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
54
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
55
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
56
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
57
"head{display:none}"
58
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
59
"html{display:block}"
60
"i{font-style:italic}"
61
"ins{text-decoration:underline}"
62
"kbd{font-family:monospace}"
63
"li{display:list-item}"
64
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
65
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
66
"p{display:block;margin:1em 0}"
67
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
68
"samp{font-family:monospace}"
69
"script{display:none}"
70
"small{font-size:0.83em}"
71
"strong{font-weight:bold}"
72
"style{display:none}"
73
"sub{font-size:0.83em;vertical-align:sub}"
74
"sup{font-size:0.83em;vertical-align:super}"
75
"table{display:table;border-spacing:2px}"
76
"tbody{display:table-row-group}"
77
"td{display:table-cell;padding:1px;background-color:inherit}"
78
"tfoot{display:table-footer-group}"
79
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}"
80
"thead{display:table-header-group}"
81
"tr{display:table-row}"
82
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
83
"ul ul{list-style-type:circle}"
84
"ul ul ul{list-style-type:square}"
85
"var{font-style:italic}"
86
"colgroup{display:table-column-group}"
87
"col{display:table-column}"
88
"caption{display:block;text-align:center}"
89
;
90
91
static const char *mobi_default_css =
92
"pagebreak{display:block;page-break-before:always}"
93
"dl,ol,ul{margin:0}"
94
"p{margin:0}"
95
"blockquote{margin:0 40px}"
96
"center{display:block;text-align:center}"
97
"big{font-size:1.17em}"
98
"strike{text-decoration:line-through}"
99
;
100
101
static const char *fb2_default_css =
102
"@page{margin:3em 2em}"
103
"FictionBook{display:block;margin:1em}"
104
"stylesheet,binary{display:none}"
105
"description>*{display:none}"
106
"description>title-info{display:block}"
107
"description>title-info>*{display:none}"
108
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
109
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
110
"image{display:block}"
111
"p>image{display:inline}"
112
"table{display:table}"
113
"tr{display:table-row}"
114
"th,td{display:table-cell}"
115
"a{color:#06C;text-decoration:underline}"
116
"a[type=note]{font-size:small;vertical-align:super}"
117
"code{white-space:pre;font-family:monospace}"
118
"emphasis{font-style:italic}"
119
"strikethrough{text-decoration:line-through}"
120
"strong{font-weight:bold}"
121
"sub{font-size:small;vertical-align:sub}"
122
"sup{font-size:small;vertical-align:super}"
123
"image{margin:1em 0;text-align:center}"
124
"cite,poem{margin:1em 2em}"
125
"subtitle,epigraph,stanza{margin:1em 0}"
126
"title>p{text-align:center;font-size:x-large}"
127
"subtitle{text-align:center;font-size:large}"
128
"p{margin-top:1em;text-align:justify}"
129
"empty-line{padding-top:1em}"
130
"p+p{margin-top:0;text-indent:1.5em}"
131
"empty-line+p{margin-top:0}"
132
"section>title{page-break-before:always}"
133
;
134
135
static const char *known_html_tags[] = {
136
  // TODO: add known FB2 tags?
137
  // Sorted list of all HTML tags.
138
  "a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
139
  "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
140
  "bgsound", "big", "blink", "blockquote", "body", "br", "button",
141
  "canvas", "caption", "center", "cite", "code", "col", "colgroup",
142
  "data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
143
  "div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
144
  "font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
145
  "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
146
  "i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
147
  "keygen", "label", "legend", "li", "link", "listing", "main",
148
  "malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
149
  "meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
150
  "nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
151
  "ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
152
  "progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
153
  "script", "section", "select", "small", "source", "spacer", "span",
154
  "strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
155
  "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
156
  "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
157
};
158
159
static const char *known_fb2_tags[] = {
160
  "FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
161
  "date", "description", "emphasis", "empty-line", "epigraph", "image",
162
  "p", "poem", "section", "stanza", "strikethrough", "strong",
163
  "stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
164
  "th", "title", "title-info", "tr", "v",
165
};
166
167
static const char *find_known_html_tag(const char *tag)
168
564
{
169
564
  int l = 0;
170
564
  int r = nelem(known_html_tags) / 2 - 1;
171
4.26k
  while (l <= r)
172
3.81k
  {
173
3.81k
    int m = (l + r) >> 1;
174
3.81k
    int c = strcmp(tag, known_html_tags[m]);
175
3.81k
    if (c < 0)
176
322
      r = m - 1;
177
3.49k
    else if (c > 0)
178
3.38k
      l = m + 1;
179
112
    else
180
112
      return known_html_tags[m];
181
3.81k
  }
182
452
  return NULL;
183
564
}
184
185
static const char *find_known_fb2_tag(const char *tag)
186
0
{
187
0
  int l = 0;
188
0
  int r = nelem(known_fb2_tags) / 2 - 1;
189
0
  while (l <= r)
190
0
  {
191
0
    int m = (l + r) >> 1;
192
0
    int c = strcmp(tag, known_fb2_tags[m]);
193
0
    if (c < 0)
194
0
      r = m - 1;
195
0
    else if (c > 0)
196
0
      l = m + 1;
197
0
    else
198
0
      return known_fb2_tags[m];
199
0
  }
200
0
  return NULL;
201
0
}
202
203
struct genstate
204
{
205
  fz_pool *pool;
206
  fz_html_font_set *set;
207
  fz_archive *zip;
208
  fz_tree *images;
209
  fz_xml_doc *xml;
210
  int is_fb2;
211
  const char *base_uri;
212
  fz_css *css;
213
  int at_bol;
214
  fz_html_box *emit_white;
215
  int last_brk_cls;
216
217
  int list_counter;
218
  int section_depth;
219
  fz_bidi_direction markup_dir;
220
  fz_text_language markup_lang;
221
  char *href;
222
223
  fz_css_style_splay *styles;
224
};
225
226
static int iswhite(int c)
227
22.7k
{
228
22.7k
  return c == ' ' || c == '\t' || c == '\r' || c == '\n';
229
22.7k
}
230
231
static int is_all_white(const char *s)
232
628
{
233
1.04k
  while (*s)
234
662
  {
235
662
    if (!iswhite(*s))
236
244
      return 0;
237
418
    ++s;
238
418
  }
239
384
  return 1;
240
628
}
241
242
/* TODO: pool allocator for flow nodes */
243
/* TODO: store text by pointing to a giant buffer */
244
245
static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
246
244
{
247
4.60k
  while (flow)
248
4.36k
  {
249
4.36k
    fz_html_flow *next = flow->next;
250
4.36k
    if (flow->type == FLOW_IMAGE)
251
0
      fz_drop_image(ctx, flow->content.image);
252
4.36k
    flow = next;
253
4.36k
  }
254
244
}
255
256
static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
257
4.36k
{
258
4.36k
  size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras);
259
4.36k
  fz_html_flow *flow;
260
261
  /* Shouldn't happen, but bug 705324. */
262
4.36k
  if (top == NULL || top->type != BOX_FLOW)
263
0
    return NULL;
264
265
4.36k
  flow = fz_pool_alloc(ctx, pool, size);
266
4.36k
  flow->type = type;
267
4.36k
  flow->expand = 0;
268
4.36k
  flow->bidi_level = 0;
269
4.36k
  flow->markup_lang = 0;
270
4.36k
  flow->breaks_line = 0;
271
4.36k
  flow->box = inline_box;
272
4.36k
  (*top->s.build.flow_tail) = flow;
273
4.36k
  top->s.build.flow_tail = &flow->next;
274
4.36k
  return flow;
275
4.36k
}
276
277
static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
278
1.57k
{
279
1.57k
  fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0);
280
1.57k
  if (flow)
281
1.57k
    flow->expand = 1;
282
1.57k
}
283
284
static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
285
0
{
286
0
  (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);
287
0
}
288
289
static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
290
484
{
291
484
  (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);
292
484
}
293
294
static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
295
0
{
296
0
  (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);
297
0
}
298
299
static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
300
2.30k
{
301
2.30k
  fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1);
302
2.30k
  if (flow == NULL)
303
0
    return;
304
2.30k
  memcpy(flow->content.text, a, b - a);
305
2.30k
  flow->content.text[b - a] = 0;
306
2.30k
  flow->markup_lang = lang;
307
2.30k
}
308
309
static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
310
0
{
311
0
  fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);
312
0
  if (flow)
313
0
    flow->content.image = fz_keep_image(ctx, img);
314
0
}
315
316
static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
317
0
{
318
0
  (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);
319
0
}
320
321
fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
322
0
{
323
0
  fz_html_flow *new_flow;
324
0
  char *text;
325
0
  size_t len;
326
327
0
  assert(flow->type == FLOW_WORD);
328
329
0
  if (offset == 0)
330
0
    return flow;
331
0
  text = flow->content.text;
332
0
  while (*text && offset)
333
0
  {
334
0
    int rune;
335
0
    text += fz_chartorune(&rune, text);
336
0
    offset--;
337
0
  }
338
0
  len = strlen(text);
339
0
  new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1);
340
0
  memcpy(new_flow, flow, offsetof(fz_html_flow, content));
341
0
  new_flow->next = flow->next;
342
0
  flow->next = new_flow;
343
0
  strcpy(new_flow->content.text, text);
344
0
  *text = 0;
345
0
  return new_flow;
346
0
}
347
348
static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g)
349
1.81k
{
350
1.81k
  static const char *space = " ";
351
1.81k
  fz_pool *pool = g->pool;
352
1.81k
  if (g->emit_white)
353
1.78k
  {
354
1.78k
    int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE;
355
1.78k
    if (!g->at_bol)
356
1.57k
    {
357
1.57k
      if (bsp)
358
1.57k
        add_flow_space(ctx, pool, flow, g->emit_white);
359
0
      else
360
0
        add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang);
361
1.57k
    }
362
1.78k
    g->emit_white = 0;
363
1.78k
  }
364
1.81k
}
365
366
/* pair-wise lookup table for UAX#14 linebreaks */
367
static const char *pairbrk[29] =
368
{
369
/*  -OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
370
/*  -PLPULSXYSROULLDNYAB2WMJ23LVTI- */
371
  "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
372
  "_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
373
  "_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
374
  "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
375
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
376
  "_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
377
  "_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
378
  "_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
379
  "_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
380
  "%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
381
  "%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
382
  "%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
383
  "%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
384
  "%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
385
  "_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
386
  "_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
387
  "_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
388
  "_^^%_%^^^_______%%__^^^______", /* BA break after */
389
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
390
  "_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
391
  "____________________^________", /* ZW zero width space */
392
  "%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
393
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
394
  "_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
395
  "_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */
396
  "_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
397
  "_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
398
  "_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
399
  "_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
400
};
401
402
static fz_html_box *
403
find_flow_encloser(fz_context *ctx, fz_html_box *flow)
404
244
{
405
  /* This code was written to assume that there will always be a
406
   * flow box enclosing callers of this. Bug 705324 shows that
407
   * this isn't always the case. In the absence of a reproducer
408
   * file, all I can do is try to patch around the issue so that
409
   * we won't crash. */
410
489
  while (flow->type != BOX_FLOW)
411
245
  {
412
245
    if (flow->up == NULL)
413
0
    {
414
0
      fz_warn(ctx, "Flow encloser not found. Please report this file!");
415
0
      break;
416
0
    }
417
245
    flow = flow->up;
418
245
  }
419
244
  return flow;
420
244
}
421
422
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
423
244
{
424
244
  fz_html_box *flow;
425
244
  fz_pool *pool = g->pool;
426
244
  int collapse = box->style->white_space & WS_COLLAPSE;
427
244
  int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
428
244
  int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;
429
430
244
  static const char *space = " ";
431
432
244
  flow = find_flow_encloser(ctx, box);
433
244
  if (flow == NULL)
434
0
    return;
435
436
3.67k
  while (*text)
437
3.43k
  {
438
3.43k
    if (bnl && (*text == '\n' || *text == '\r'))
439
0
    {
440
0
      if (text[0] == '\r' && text[1] == '\n')
441
0
        text += 2;
442
0
      else
443
0
        text += 1;
444
0
      add_flow_break(ctx, pool, flow, box);
445
0
      g->at_bol = 1;
446
0
    }
447
3.43k
    else if (iswhite(*text))
448
1.61k
    {
449
1.61k
      if (collapse)
450
1.61k
      {
451
1.61k
        if (bnl)
452
0
          while (*text == ' ' || *text == '\t')
453
0
            ++text;
454
1.61k
        else
455
3.42k
          while (iswhite(*text))
456
1.81k
            ++text;
457
1.61k
        g->emit_white = box;
458
1.61k
      }
459
0
      else
460
0
      {
461
        // TODO: tabs
462
0
        if (bsp)
463
0
          add_flow_space(ctx, pool, flow, box);
464
0
        else
465
0
          add_flow_word(ctx, pool, flow, box, space, space+1, lang);
466
0
        ++text;
467
0
      }
468
1.61k
      g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
469
1.61k
    }
470
1.81k
    else
471
1.81k
    {
472
1.81k
      const char *prev, *mark = text;
473
1.81k
      int c;
474
475
1.81k
      flush_space(ctx, flow, lang, g);
476
477
1.81k
      if (g->at_bol)
478
244
        g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;
479
480
15.4k
      while (*text && !iswhite(*text))
481
13.6k
      {
482
13.6k
        prev = text;
483
13.6k
        text += fz_chartorune(&c, text);
484
13.6k
        if (c == 0xAD) /* soft hyphen */
485
0
        {
486
0
          if (mark != prev)
487
0
            add_flow_word(ctx, pool, flow, box, mark, prev, lang);
488
0
          add_flow_shyphen(ctx, pool, flow, box);
489
0
          mark = text;
490
0
          g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
491
0
        }
492
13.6k
        else if (bsp) /* allow soft breaks */
493
13.6k
        {
494
13.6k
          int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
495
13.6k
          if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI)
496
13.6k
          {
497
13.6k
            int brk = pairbrk[g->last_brk_cls][this_brk_cls];
498
499
            /* we handle spaces elsewhere, so ignore these classes */
500
13.6k
            if (brk == '@') brk = '^';
501
13.6k
            if (brk == '#') brk = '^';
502
13.6k
            if (brk == '%') brk = '^';
503
504
13.6k
            if (brk == '_')
505
484
            {
506
484
              if (mark != prev)
507
484
                add_flow_word(ctx, pool, flow, box, mark, prev, lang);
508
484
              add_flow_sbreak(ctx, pool, flow, box);
509
484
              mark = prev;
510
484
            }
511
512
13.6k
            g->last_brk_cls = this_brk_cls;
513
13.6k
          }
514
13.6k
        }
515
13.6k
      }
516
1.81k
      if (mark != text)
517
1.81k
        add_flow_word(ctx, pool, flow, box, mark, text, lang);
518
519
1.81k
      g->at_bol = 0;
520
1.81k
    }
521
3.43k
  }
522
244
}
523
524
static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
525
0
{
526
0
  char path[2048];
527
0
  fz_image *img = NULL;
528
0
  fz_buffer *buf = NULL;
529
530
0
  fz_var(img);
531
0
  fz_var(buf);
532
533
0
  fz_try(ctx)
534
0
  {
535
0
    if (!strncmp(src, "data:image/jpeg;base64,", 23))
536
0
      buf = fz_new_buffer_from_base64(ctx, src+23, 0);
537
0
    else if (!strncmp(src, "data:image/png;base64,", 22))
538
0
      buf = fz_new_buffer_from_base64(ctx, src+22, 0);
539
0
    else if (!strncmp(src, "data:image/gif;base64,", 22))
540
0
      buf = fz_new_buffer_from_base64(ctx, src+22, 0);
541
0
    else
542
0
    {
543
0
      fz_strlcpy(path, base_uri, sizeof path);
544
0
      fz_strlcat(path, "/", sizeof path);
545
0
      fz_strlcat(path, src, sizeof path);
546
0
      fz_urldecode(path);
547
0
      fz_cleanname(path);
548
0
      buf = fz_read_archive_entry(ctx, zip, path);
549
0
    }
550
0
#if FZ_ENABLE_SVG
551
0
    if (strstr(src, ".svg"))
552
0
      img = fz_new_image_from_svg(ctx, buf, base_uri, zip);
553
0
    else
554
0
#endif
555
0
      img = fz_new_image_from_buffer(ctx, buf);
556
0
  }
557
0
  fz_always(ctx)
558
0
    fz_drop_buffer(ctx, buf);
559
0
  fz_catch(ctx)
560
0
  {
561
0
    fz_ignore_error(ctx);
562
0
    fz_warn(ctx, "html: cannot load image src='%s'", src);
563
0
  }
564
565
0
  return img;
566
0
}
567
568
static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri,
569
  fz_xml_doc *xmldoc, fz_xml *node)
570
0
{
571
0
  fz_image *img = NULL;
572
0
#if FZ_ENABLE_SVG
573
0
  fz_try(ctx)
574
0
    img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip);
575
0
  fz_catch(ctx)
576
0
  {
577
0
    fz_ignore_error(ctx);
578
0
    fz_warn(ctx, "html: cannot load embedded svg document");
579
0
  }
580
0
#endif
581
0
  return img;
582
0
}
583
584
static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
585
0
{
586
0
  fz_html_box *flow;
587
0
  fz_pool *pool = g->pool;
588
589
0
  flow = find_flow_encloser(ctx, box);
590
591
0
  flush_space(ctx, flow, 0, g);
592
593
0
  if (!img)
594
0
  {
595
0
    const char *alt = "[image]";
596
0
    add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0);
597
0
  }
598
0
  else
599
0
  {
600
0
    fz_try(ctx)
601
0
    {
602
0
      add_flow_sbreak(ctx, pool, flow, box);
603
0
      add_flow_image(ctx, pool, flow, box, img);
604
0
      add_flow_sbreak(ctx, pool, flow, box);
605
0
    }
606
0
    fz_always(ctx)
607
0
    {
608
0
      fz_drop_image(ctx, img);
609
0
    }
610
0
    fz_catch(ctx)
611
0
      fz_rethrow(ctx);
612
0
  }
613
614
0
  g->at_bol = 0;
615
0
}
616
617
static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
618
1.10k
{
619
2.15k
  while (box)
620
1.04k
  {
621
1.04k
    fz_html_box *next = box->next;
622
1.04k
    if (box->type == BOX_FLOW)
623
244
      fz_drop_html_flow(ctx, box->u.flow.head);
624
1.04k
    fz_drop_html_box(ctx, box->down);
625
1.04k
    box = next;
626
1.04k
  }
627
1.10k
}
628
629
static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
630
14
{
631
14
  fz_html *html = (fz_html *)stor;
632
14
  fz_drop_html_box(ctx, html->tree.root);
633
14
  fz_drop_pool(ctx, html->tree.pool);
634
14
}
635
636
static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor)
637
39
{
638
39
  fz_story *story = (fz_story *)stor;
639
39
  fz_free(ctx, story->user_css);
640
39
  fz_drop_html_font_set(ctx, story->font_set);
641
39
  fz_drop_xml(ctx, story->dom);
642
39
  fz_drop_html_box(ctx, story->tree.root);
643
39
  fz_drop_buffer(ctx, story->warnings);
644
39
  fz_drop_archive(ctx, story->zip);
645
  /* The pool must be the last thing dropped. */
646
39
  fz_drop_pool(ctx, story->tree.pool);
647
39
}
648
649
/* Drop a structure derived from an html_tree. The exact things
650
 * freed here will depend upon the drop function with which it
651
 * was created. */
652
static void
653
fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree)
654
107
{
655
107
  fz_defer_reap_start(ctx);
656
107
  fz_drop_storable(ctx, &tree->storable);
657
107
  fz_defer_reap_end(ctx);
658
107
}
659
660
void fz_drop_html(fz_context *ctx, fz_html *html)
661
68
{
662
68
  fz_drop_html_tree(ctx, &html->tree);
663
68
}
664
665
void fz_drop_story(fz_context *ctx, fz_story *story)
666
39
{
667
39
  if (!story)
668
0
    return;
669
670
39
  fz_drop_html_tree(ctx, &story->tree);
671
39
}
672
673
fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
674
0
{
675
0
  return fz_keep_storable(ctx, &html->tree.storable);
676
0
}
677
678
static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style)
679
1.04k
{
680
1.04k
  fz_html_box *box;
681
1.04k
  const char *tag = fz_xml_tag(node);
682
1.04k
  const char *id = fz_xml_att(node, "id");
683
1.04k
  const char *href;
684
685
1.04k
  if (type == BOX_INLINE)
686
246
    box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u));
687
803
  else if (type == BOX_FLOW)
688
244
    box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow));
689
559
  else
690
559
    box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block));
691
692
1.04k
  box->type = type;
693
1.04k
  box->is_first_flow = 0;
694
1.04k
  box->markup_dir = g->markup_dir;
695
1.04k
  box->heading = 0;
696
1.04k
  box->list_item = 0;
697
698
1.04k
  box->style = fz_css_enlist(ctx, style, &g->styles, g->pool);
699
700
1.04k
  if (tag)
701
564
  {
702
564
    box->tag = find_known_html_tag(tag);
703
564
    if (!box->tag && g->is_fb2)
704
0
      box->tag = find_known_fb2_tag(tag);
705
564
    if (!box->tag)
706
452
      box->tag = fz_pool_strdup(ctx, g->pool, tag);
707
564
  }
708
485
  else
709
485
  {
710
485
    box->tag = "#anon";
711
485
  }
712
713
1.04k
  if (id)
714
4
    box->id = fz_pool_strdup(ctx, g->pool, id);
715
716
1.04k
  if (tag && tag[0]=='a' && tag[1]==0)
717
0
  {
718
    // Support deprecated anchor syntax with id in "name" instead of "id" attribute.
719
0
    if (!id)
720
0
    {
721
0
      const char *name = fz_xml_att(node, "name");
722
0
      if (name)
723
0
        box->id = fz_pool_strdup(ctx, g->pool, name);
724
0
    }
725
726
0
    if (g->is_fb2)
727
0
    {
728
0
      href = fz_xml_att(node, "l:href");
729
0
      if (!href)
730
0
        href = fz_xml_att(node, "xlink:href");
731
0
    }
732
0
    else
733
0
    {
734
0
      href = fz_xml_att(node, "href");
735
0
    }
736
0
    if (href)
737
0
      g->href = fz_pool_strdup(ctx, g->pool, href);
738
0
  }
739
740
1.04k
  if (g->href)
741
0
    box->href = g->href;
742
743
1.04k
  if (type == BOX_FLOW)
744
244
  {
745
244
    box->u.flow.head = NULL;
746
244
    box->s.build.flow_tail = &box->u.flow.head;
747
244
  }
748
749
1.04k
  return box;
750
1.04k
}
751
752
static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child)
753
996
{
754
996
  child->up = parent;
755
996
  if (!parent->down)
756
638
    parent->down = child;
757
996
  if (parent->s.build.last_child)
758
358
    parent->s.build.last_child->next = child;
759
996
  parent->s.build.last_child = child;
760
996
}
761
762
static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box)
763
506
{
764
506
  while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL)
765
0
    box = box->up;
766
506
  return box;
767
506
}
768
769
static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box)
770
0
{
771
0
  fz_html_box *look = box;
772
0
  while (look && look->type != BOX_TABLE)
773
0
    look = look->up;
774
0
  if (look)
775
0
    return look;
776
0
  fz_warn(ctx, "table-row not inside table element");
777
0
  return NULL;
778
0
}
779
780
static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box)
781
0
{
782
0
  fz_html_box *look = box;
783
0
  while (look && look->type != BOX_TABLE_ROW)
784
0
    look = look->up;
785
0
  if (look)
786
0
    return look;
787
0
  fz_warn(ctx, "table-cell not inside table-row element");
788
0
  return NULL;
789
0
}
790
791
static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box)
792
246
{
793
246
  fz_css_style style;
794
246
  fz_html_box *flow_box;
795
796
246
  if (box->type == BOX_FLOW || box->type == BOX_INLINE)
797
2
    return box;
798
799
  // We have an inline element that is not in an existing flow/inline context.
800
801
  // Find the closest block level box to insert content into.
802
244
  while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL)
803
0
    box = box->up;
804
805
  // Concatenate onto the last open flow box if we have one.
806
244
  if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW)
807
0
    return box->s.build.last_child;
808
809
  // No flow box found, create and insert one!
810
811
  // TODO: null style instead of default for flow box?
812
244
  fz_default_css_style(ctx, &style);
813
244
  flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style);
814
244
  flow_box->is_first_flow = !box->down;
815
244
  g->at_bol = 1;
816
817
244
  append_box(ctx, box, flow_box);
818
819
244
  return flow_box;
820
244
}
821
822
static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match);
823
824
static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
825
628
{
826
628
  fz_html_box *anon_box;
827
628
  fz_css_style style;
828
628
  const char *text;
829
628
  int collapse;
830
831
628
  text = fz_xml_text(node);
832
628
  collapse = root_box->style->white_space & WS_COLLAPSE;
833
628
  if (collapse && is_all_white(text))
834
384
  {
835
384
    g->emit_white = root_box;
836
384
  }
837
244
  else
838
244
  {
839
244
    if (root_box->type != BOX_INLINE)
840
188
    {
841
      /* Create anonymous inline box, with the same style as the top block box. */
842
188
      style = *root_box->style;
843
844
      // Make sure not to recursively multiply font sizes
845
188
      style.font_size.value = 1;
846
188
      style.font_size.unit = N_SCALE;
847
848
188
      root_box = find_inline_context(ctx, g, root_box);
849
188
      anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style);
850
188
      append_box(ctx, root_box, anon_box);
851
188
      root_box = anon_box;
852
188
    }
853
854
244
    generate_text(ctx, root_box, text, g->markup_lang, g);
855
244
  }
856
628
}
857
858
static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
859
58
{
860
58
  fz_html_box *this_box;
861
58
  fz_html_box *flow_box;
862
58
  root_box = find_inline_context(ctx, g, root_box);
863
58
  this_box = new_box(ctx, g, node, BOX_INLINE, style);
864
58
  append_box(ctx, root_box, this_box);
865
58
  if (this_box->id)
866
0
  {
867
0
    flow_box = find_flow_encloser(ctx, this_box);
868
0
    add_flow_anchor(ctx, g->pool, flow_box, this_box);
869
0
  }
870
58
  return this_box;
871
58
}
872
873
static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
874
0
{
875
0
  fz_html_box *this_box;
876
0
  fz_html_box *flow_box;
877
878
0
  if (root_box->type != BOX_INLINE)
879
0
  {
880
    /* Create inline box to hold the <br> tag, with the same style as containing block. */
881
    /* Make sure not to recursively multiply font sizes. */
882
0
    fz_css_style style = *root_box->style;
883
0
    style.font_size.value = 1;
884
0
    style.font_size.unit = N_SCALE;
885
0
    this_box = new_box(ctx, g, node, BOX_INLINE, &style);
886
0
    append_box(ctx, find_inline_context(ctx, g, root_box), this_box);
887
0
  }
888
0
  else
889
0
  {
890
0
    this_box = root_box;
891
0
  }
892
893
0
  flow_box = find_flow_encloser(ctx, this_box);
894
0
  add_flow_break(ctx, g->pool, flow_box, this_box);
895
0
  g->at_bol = 1;
896
0
}
897
898
static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
899
506
{
900
506
  fz_html_box *this_box;
901
506
  root_box = find_block_context(ctx, root_box);
902
506
  this_box = new_box(ctx, g, node, BOX_BLOCK, style);
903
506
  append_box(ctx, root_box, this_box);
904
506
  return this_box;
905
506
}
906
907
static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
908
0
{
909
0
  fz_html_box *this_box;
910
0
  root_box = find_block_context(ctx, root_box);
911
0
  this_box = new_box(ctx, g, node, BOX_TABLE, style);
912
0
  append_box(ctx, root_box, this_box);
913
0
  return this_box;
914
0
}
915
916
static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
917
0
{
918
0
  fz_html_box *this_box, *table_box;
919
920
0
  table_box = find_table_row_context(ctx, root_box);
921
0
  if (!table_box)
922
0
    return gen2_block(ctx, g, root_box, node, style);
923
924
0
  this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style);
925
0
  append_box(ctx, table_box, this_box);
926
0
  return this_box;
927
0
}
928
929
static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
930
0
{
931
0
  fz_html_box *this_box, *row_box;
932
933
0
  row_box = find_table_cell_context(ctx, root_box);
934
0
  if (!row_box)
935
0
    return gen2_block(ctx, g, root_box, node, style);
936
937
0
  this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style);
938
0
  append_box(ctx, row_box, this_box);
939
0
  return this_box;
940
0
}
941
942
static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style)
943
0
{
944
0
  fz_html_box *img_block_box;
945
0
  fz_html_box *img_inline_box;
946
947
0
  if (display == DIS_INLINE || display == DIS_INLINE_BLOCK)
948
0
  {
949
0
    root_box = find_inline_context(ctx, g, root_box);
950
0
    img_inline_box = new_box(ctx, g, node, BOX_INLINE, style);
951
0
    append_box(ctx, root_box, img_inline_box);
952
0
    generate_image(ctx, img_inline_box, img, g);
953
0
  }
954
0
  else
955
0
  {
956
0
    root_box = find_block_context(ctx, root_box);
957
0
    img_block_box = new_box(ctx, g, node, BOX_BLOCK, style);
958
0
    append_box(ctx, root_box, img_block_box);
959
960
0
    root_box = find_inline_context(ctx, g, img_block_box);
961
0
    img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style);
962
0
    append_box(ctx, root_box, img_inline_box);
963
0
    generate_image(ctx, img_inline_box, img, g);
964
0
  }
965
0
}
966
967
static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
968
0
{
969
0
  const char *src = fz_xml_att(node, "src");
970
0
  if (src)
971
0
  {
972
0
    fz_css_style local_style = *style;
973
0
    fz_image *img;
974
0
    int w, h;
975
0
    const char *w_att = fz_xml_att(node, "width");
976
0
    const char *h_att = fz_xml_att(node, "height");
977
978
0
    if (w_att && (w = fz_atoi(w_att)) > 0)
979
0
    {
980
0
      local_style.width.value = w;
981
0
      local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
982
0
    }
983
0
    if (h_att && (h = fz_atoi(h_att)) > 0)
984
0
    {
985
0
      local_style.height.value = h;
986
0
      local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
987
0
    }
988
989
0
    img = load_html_image(ctx, g->zip, g->base_uri, src);
990
0
    gen2_image_common(ctx, g, root_box, node, img, display, &local_style);
991
0
  }
992
0
}
993
994
static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
995
0
{
996
0
  const char *src = fz_xml_att(node, "l:href");
997
0
  if (!src)
998
0
    src = fz_xml_att(node, "xlink:href");
999
0
  if (src && src[0] == '#')
1000
0
  {
1001
0
    fz_image *img = fz_tree_lookup(ctx, g->images, src+1);
1002
0
    gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style);
1003
0
  }
1004
0
}
1005
1006
static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
1007
0
{
1008
0
  fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node);
1009
0
  gen2_image_common(ctx, g, root_box, node, img, display, style);
1010
0
}
1011
1012
static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag)
1013
506
{
1014
506
  if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0)
1015
1
  {
1016
1
    switch (tag[1])
1017
1
    {
1018
1
    case '1': return 1;
1019
0
    case '2': return 2;
1020
0
    case '3': return 3;
1021
0
    case '4': return 4;
1022
0
    case '5': return 5;
1023
0
    case '6': return 6;
1024
1
    }
1025
1
  }
1026
505
  if (g->is_fb2)
1027
0
  {
1028
0
    if (!strcmp(tag, "title") || !strcmp(tag, "subtitle"))
1029
0
      return fz_mini(g->section_depth, 6);
1030
0
  }
1031
505
  return 0;
1032
505
}
1033
1034
static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node,
1035
  fz_css_match *match, int display, fz_css_style *style)
1036
616
{
1037
616
  fz_html_box *this_box;
1038
616
  const char *tag;
1039
616
  const char *lang_att;
1040
616
  const char *dir_att;
1041
1042
616
  int save_markup_dir = g->markup_dir;
1043
616
  int save_markup_lang = g->markup_lang;
1044
616
  char *save_href = g->href;
1045
1046
616
  if (display == DIS_NONE)
1047
52
    return;
1048
1049
564
  tag = fz_xml_tag(node);
1050
1051
564
  dir_att = fz_xml_att(node, "dir");
1052
564
  if (dir_att)
1053
55
  {
1054
55
    if (!strcmp(dir_att, "auto"))
1055
0
      g->markup_dir = FZ_BIDI_NEUTRAL;
1056
55
    else if (!strcmp(dir_att, "rtl"))
1057
0
      g->markup_dir = FZ_BIDI_RTL;
1058
55
    else if (!strcmp(dir_att, "ltr"))
1059
55
      g->markup_dir = FZ_BIDI_LTR;
1060
0
    else
1061
0
      g->markup_dir = DEFAULT_DIR;
1062
55
  }
1063
1064
564
  lang_att = fz_xml_att(node, "lang");
1065
564
  if (lang_att)
1066
0
    g->markup_lang = fz_text_language_from_string(lang_att);
1067
1068
564
  switch (display)
1069
564
  {
1070
0
  case DIS_INLINE_BLOCK:
1071
    // TODO handle inline block as a flow node
1072
0
    this_box = gen2_block(ctx, g, root_box, node, style);
1073
0
    break;
1074
1075
506
  case DIS_BLOCK:
1076
506
    this_box = gen2_block(ctx, g, root_box, node, style);
1077
506
    this_box->heading = get_heading_from_tag(ctx, g, tag);
1078
506
    break;
1079
1080
0
  case DIS_LIST_ITEM:
1081
0
    this_box = gen2_block(ctx, g, root_box, node, style);
1082
0
    this_box->list_item = ++g->list_counter;
1083
0
    break;
1084
1085
  // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes
1086
  //
1087
  // The table generation code should insert and create anonymous boxes
1088
  // for any missing child/parent elements.
1089
  //
1090
  // MISSING CHILDREN:
1091
  // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW.
1092
  // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL.
1093
  //
1094
  // MISSING PARENTS:
1095
  // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW
1096
  // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE
1097
  //
1098
  // For now we ignore this and treat any such elements that are out of
1099
  // context as plain block elements.
1100
1101
0
  case DIS_TABLE:
1102
0
    this_box = gen2_table(ctx, g, root_box, node, style);
1103
0
    break;
1104
0
  case DIS_TABLE_GROUP:
1105
    // no box for table-row-group elements
1106
0
    this_box = root_box;
1107
0
    break;
1108
0
  case DIS_TABLE_ROW:
1109
0
    this_box = gen2_table_row(ctx, g, root_box, node, style);
1110
0
    break;
1111
0
  case DIS_TABLE_CELL:
1112
0
    this_box = gen2_table_cell(ctx, g, root_box, node, style);
1113
0
    break;
1114
1115
58
  case DIS_INLINE:
1116
58
  default:
1117
58
    this_box = gen2_inline(ctx, g, root_box, node, style);
1118
58
    break;
1119
564
  }
1120
1121
564
  if (tag && !strcmp(tag, "ol"))
1122
0
  {
1123
0
    int save_list_counter = g->list_counter;
1124
0
    g->list_counter = 0;
1125
0
    gen2_children(ctx, g, this_box, node, match);
1126
0
    g->list_counter = save_list_counter;
1127
0
  }
1128
564
  else if (tag && !strcmp(tag, "section"))
1129
0
  {
1130
0
    int save_section_depth = g->section_depth;
1131
0
    g->section_depth++;
1132
0
    gen2_children(ctx, g, this_box, node, match);
1133
0
    g->section_depth = save_section_depth;
1134
0
  }
1135
564
  else
1136
564
  {
1137
564
    gen2_children(ctx, g, this_box, node, match);
1138
564
  }
1139
1140
564
  g->markup_dir = save_markup_dir;
1141
564
  g->markup_lang = save_markup_lang;
1142
564
  g->href = save_href;
1143
564
}
1144
1145
static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match)
1146
564
{
1147
564
  fz_xml *node;
1148
564
  const char *tag;
1149
564
  fz_css_match match;
1150
564
  fz_css_style style;
1151
564
  int display;
1152
1153
1.75k
  for (node = fz_xml_down(root_node); node; node = fz_xml_next(node))
1154
1.19k
  {
1155
1.19k
    tag = fz_xml_tag(node);
1156
1.19k
    if (tag)
1157
563
    {
1158
563
      fz_match_css(ctx, &match, root_match, g->css, node);
1159
563
      fz_apply_css_style(ctx, g->set, &style, &match);
1160
563
      display = fz_get_css_match_display(&match);
1161
563
      if (tag[0]=='b' && tag[1]=='r' && tag[2]==0)
1162
0
      {
1163
0
        gen2_break(ctx, g, root_box, node);
1164
0
      }
1165
563
      else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0)
1166
0
      {
1167
0
        gen2_image_html(ctx, g, root_box, node, display, &style);
1168
0
      }
1169
563
      else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0)
1170
0
      {
1171
0
        gen2_image_fb2(ctx, g, root_box, node, display, &style);
1172
0
      }
1173
563
      else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0)
1174
0
      {
1175
0
        gen2_image_svg(ctx, g, root_box, node, display, &style);
1176
0
      }
1177
563
      else
1178
563
      {
1179
563
        gen2_tag(ctx, g, root_box, node, &match, display, &style);
1180
563
      }
1181
563
    }
1182
628
    else
1183
628
    {
1184
628
      gen2_text(ctx, g, root_box, node);
1185
628
    }
1186
1.19k
  }
1187
564
}
1188
1189
static char *concat_text(fz_context *ctx, fz_xml *root)
1190
0
{
1191
0
  fz_xml *node;
1192
0
  size_t i = 0, n = 1;
1193
0
  char *s;
1194
0
  for (node = fz_xml_down(root); node; node = fz_xml_next(node))
1195
0
  {
1196
0
    const char *text = fz_xml_text(node);
1197
0
    n += text ? strlen(text) : 0;
1198
0
  }
1199
0
  s = Memento_label(fz_malloc(ctx, n), "concat_html");
1200
0
  for (node = fz_xml_down(root); node; node = fz_xml_next(node))
1201
0
  {
1202
0
    const char *text = fz_xml_text(node);
1203
0
    if (text)
1204
0
    {
1205
0
      n = strlen(text);
1206
0
      memcpy(s+i, text, n);
1207
0
      i += n;
1208
0
    }
1209
0
  }
1210
0
  s[i] = 0;
1211
0
  return s;
1212
0
}
1213
1214
static void
1215
html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
1216
0
{
1217
0
  char path[2048];
1218
0
  char css_base_uri[2048];
1219
0
  fz_buffer *buf;
1220
1221
0
  fz_var(buf);
1222
1223
0
  fz_strlcpy(path, base_uri, sizeof path);
1224
0
  fz_strlcat(path, "/", sizeof path);
1225
0
  fz_strlcat(path, href, sizeof path);
1226
0
  fz_urldecode(path);
1227
0
  fz_cleanname(path);
1228
1229
0
  fz_dirname(css_base_uri, path, sizeof css_base_uri);
1230
1231
0
  buf = NULL;
1232
0
  fz_try(ctx)
1233
0
  {
1234
0
    buf = fz_read_archive_entry(ctx, zip, path);
1235
0
    fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path);
1236
0
    fz_add_css_font_faces(ctx, set, zip, css_base_uri, css);
1237
0
  }
1238
0
  fz_always(ctx)
1239
0
    fz_drop_buffer(ctx, buf);
1240
0
  fz_catch(ctx)
1241
0
  {
1242
0
    fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1243
0
    fz_report_error(ctx);
1244
0
    fz_warn(ctx, "ignoring stylesheet %s", path);
1245
0
  }
1246
0
}
1247
1248
static void
1249
html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1250
53
{
1251
53
  fz_xml *html, *head, *node;
1252
1253
53
  html = fz_xml_find(root, "html");
1254
53
  head = fz_xml_find_down(html, "head");
1255
65
  for (node = fz_xml_down(head); node; node = fz_xml_next(node))
1256
12
  {
1257
12
    if (fz_xml_is_tag(node, "link"))
1258
0
    {
1259
0
      char *rel = fz_xml_att(node, "rel");
1260
0
      if (rel && !fz_strcasecmp(rel, "stylesheet"))
1261
0
      {
1262
0
        char *type = fz_xml_att(node, "type");
1263
0
        if ((type && !strcmp(type, "text/css")) || !type)
1264
0
        {
1265
0
          char *href = fz_xml_att(node, "href");
1266
0
          if (href)
1267
0
          {
1268
0
            html_load_css_link(ctx, set, zip, base_uri, css, root, href);
1269
0
          }
1270
0
        }
1271
0
      }
1272
0
    }
1273
12
    else if (fz_xml_is_tag(node, "style"))
1274
0
    {
1275
0
      char *s = concat_text(ctx, node);
1276
0
      fz_try(ctx)
1277
0
      {
1278
0
        fz_parse_css(ctx, css, s, "<style>");
1279
0
        fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1280
0
      }
1281
0
      fz_always(ctx)
1282
0
        fz_free(ctx, s);
1283
0
      fz_catch(ctx)
1284
0
      {
1285
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1286
0
        fz_report_error(ctx);
1287
0
        fz_warn(ctx, "ignoring inline stylesheet");
1288
0
      }
1289
0
    }
1290
12
  }
1291
53
}
1292
1293
static void
1294
fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1295
0
{
1296
0
  fz_xml *fictionbook, *stylesheet;
1297
1298
0
  fictionbook = fz_xml_find(root, "FictionBook");
1299
0
  stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
1300
0
  if (stylesheet)
1301
0
  {
1302
0
    char *s = concat_text(ctx, stylesheet);
1303
0
    fz_try(ctx)
1304
0
    {
1305
0
      fz_parse_css(ctx, css, s, "<stylesheet>");
1306
0
      fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1307
0
    }
1308
0
    fz_catch(ctx)
1309
0
    {
1310
0
      fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1311
0
      fz_report_error(ctx);
1312
0
      fz_warn(ctx, "ignoring inline stylesheet");
1313
0
    }
1314
0
    fz_free(ctx, s);
1315
0
  }
1316
0
}
1317
1318
static fz_tree *
1319
load_fb2_images(fz_context *ctx, fz_xml *root)
1320
0
{
1321
0
  fz_xml *fictionbook, *binary;
1322
0
  fz_tree *images = NULL;
1323
1324
0
  fictionbook = fz_xml_find(root, "FictionBook");
1325
0
  for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
1326
0
  {
1327
0
    const char *id = fz_xml_att(binary, "id");
1328
0
    char *b64 = NULL;
1329
0
    fz_buffer *buf = NULL;
1330
0
    fz_image *img = NULL;
1331
1332
0
    fz_var(b64);
1333
0
    fz_var(buf);
1334
1335
0
    if (id == NULL)
1336
0
    {
1337
0
      fz_warn(ctx, "Skipping image with no id");
1338
0
      continue;
1339
0
    }
1340
1341
0
    fz_try(ctx)
1342
0
    {
1343
0
      b64 = concat_text(ctx, binary);
1344
0
      buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
1345
0
      img = fz_new_image_from_buffer(ctx, buf);
1346
0
    }
1347
0
    fz_always(ctx)
1348
0
    {
1349
0
      fz_drop_buffer(ctx, buf);
1350
0
      fz_free(ctx, b64);
1351
0
    }
1352
0
    fz_catch(ctx)
1353
0
      fz_rethrow(ctx);
1354
1355
0
    images = fz_tree_insert(ctx, images, id, img);
1356
0
  }
1357
1358
0
  return images;
1359
0
}
1360
1361
typedef struct
1362
{
1363
  uint32_t *data;
1364
  size_t cap;
1365
  size_t len;
1366
} uni_buf;
1367
1368
typedef struct
1369
{
1370
  fz_context *ctx;
1371
  fz_pool *pool;
1372
  fz_html_flow *flow;
1373
  uni_buf *buffer;
1374
} bidi_data;
1375
1376
static void fragment_cb(const uint32_t *fragment,
1377
      size_t fragment_len,
1378
      int bidi_level,
1379
      int script,
1380
      void *arg)
1381
244
{
1382
244
  bidi_data *data = (bidi_data *)arg;
1383
1384
  /* We are guaranteed that fragmentOffset will be at the beginning
1385
   * of flow. */
1386
4.60k
  while (fragment_len > 0)
1387
4.36k
  {
1388
4.36k
    size_t len;
1389
1390
4.36k
    if (data->flow->type == FLOW_SPACE)
1391
1.57k
    {
1392
1.57k
      len = 1;
1393
1.57k
    }
1394
2.78k
    else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
1395
2.78k
        data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
1396
484
    {
1397
484
      len = 0;
1398
484
    }
1399
2.30k
    else
1400
2.30k
    {
1401
      /* Must be text */
1402
2.30k
      len = fz_utflen(data->flow->content.text);
1403
2.30k
      if (len > fragment_len)
1404
0
      {
1405
        /* We need to split this flow box */
1406
0
        (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len);
1407
0
        len = fz_utflen(data->flow->content.text);
1408
0
      }
1409
2.30k
    }
1410
1411
    /* This flow box is entirely contained within this fragment. */
1412
4.36k
    data->flow->bidi_level = bidi_level;
1413
4.36k
    data->flow->script = script;
1414
4.36k
    data->flow = data->flow->next;
1415
4.36k
    fragment_len -= len;
1416
4.36k
  }
1417
244
}
1418
1419
static fz_bidi_direction
1420
detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
1421
244
{
1422
244
  fz_html_flow *end = flow;
1423
244
  bidi_data data;
1424
1425
488
  while (end)
1426
244
  {
1427
244
    int level = end->bidi_level;
1428
1429
    /* Gather the text from the flow up into a single buffer (at
1430
     * least, as much of it as has the same direction markup). */
1431
244
    buffer->len = 0;
1432
4.60k
    while (end && (level & 1) == (end->bidi_level & 1))
1433
4.36k
    {
1434
4.36k
      size_t len = 0;
1435
4.36k
      const char *text = "";
1436
4.36k
      int broken = 0;
1437
1438
4.36k
      switch (end->type)
1439
4.36k
      {
1440
2.30k
      case FLOW_WORD:
1441
2.30k
        len = fz_utflen(end->content.text);
1442
2.30k
        text = end->content.text;
1443
2.30k
        break;
1444
1.57k
      case FLOW_SPACE:
1445
1.57k
        len = 1;
1446
1.57k
        text = " ";
1447
1.57k
        break;
1448
0
      case FLOW_SHYPHEN:
1449
484
      case FLOW_SBREAK:
1450
484
        break;
1451
0
      case FLOW_BREAK:
1452
0
      case FLOW_IMAGE:
1453
0
        broken = 1;
1454
0
        break;
1455
4.36k
      }
1456
1457
4.36k
      end = end->next;
1458
1459
4.36k
      if (broken)
1460
0
        break;
1461
1462
      /* Make sure the buffer is large enough */
1463
4.36k
      if (buffer->len + len > buffer->cap)
1464
60
      {
1465
60
        size_t newcap = buffer->cap;
1466
60
        if (newcap < 128)
1467
47
          newcap = 128; /* Sensible small default */
1468
1469
73
        while (newcap < buffer->len + len)
1470
13
          newcap = (newcap * 3) / 2;
1471
1472
60
        buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
1473
60
        buffer->cap = newcap;
1474
60
      }
1475
1476
      /* Expand the utf8 text into Unicode and store it in the buffer */
1477
19.5k
      while (*text)
1478
15.2k
      {
1479
15.2k
        int rune;
1480
15.2k
        text += fz_chartorune(&rune, text);
1481
15.2k
        buffer->data[buffer->len++] = rune;
1482
15.2k
      }
1483
4.36k
    }
1484
1485
    /* Detect directionality for the buffer */
1486
244
    data.ctx = ctx;
1487
244
    data.pool = pool;
1488
244
    data.flow = flow;
1489
244
    data.buffer = buffer;
1490
244
    fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
1491
244
    flow = end;
1492
244
  }
1493
244
  return bidi_dir;
1494
244
}
1495
1496
static void
1497
detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
1498
1.10k
{
1499
2.15k
  while (box)
1500
1.04k
  {
1501
1.04k
    if (box->type == BOX_FLOW)
1502
244
      box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head);
1503
1.04k
    detect_box_directionality(ctx, pool, buffer, box->down);
1504
1.04k
    box = box->next;
1505
1.04k
  }
1506
1.10k
}
1507
1508
static void
1509
detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
1510
53
{
1511
53
  uni_buf buffer = { NULL };
1512
1513
106
  fz_try(ctx)
1514
106
    detect_box_directionality(ctx, pool, &buffer, box);
1515
106
  fz_always(ctx)
1516
53
    fz_free(ctx, buffer.data);
1517
53
  fz_catch(ctx)
1518
0
    fz_rethrow(ctx);
1519
53
}
1520
1521
static fz_xml_doc *
1522
parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5)
1523
53
{
1524
53
  fz_xml_doc *xml;
1525
1526
53
  if (try_xml && try_html5)
1527
0
  {
1528
0
    fz_try(ctx)
1529
0
      xml = fz_parse_xml(ctx, buf, 1);
1530
0
    fz_catch(ctx)
1531
0
    {
1532
0
      if (fz_caught(ctx) == FZ_ERROR_SYNTAX)
1533
0
      {
1534
0
        fz_report_error(ctx);
1535
0
        fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser");
1536
0
        xml = fz_parse_xml_from_html5(ctx, buf);
1537
0
      }
1538
0
      else
1539
0
        fz_rethrow(ctx);
1540
0
    }
1541
0
  }
1542
53
  else if (try_xml)
1543
0
    xml = fz_parse_xml(ctx, buf, 1);
1544
53
  else
1545
53
  {
1546
53
    assert(try_html5);
1547
53
    xml = fz_parse_xml_from_html5(ctx, buf);
1548
53
  }
1549
1550
53
  return xml;
1551
53
}
1552
1553
static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from)
1554
0
{
1555
0
  fz_css_color transparent = { 0, 0, 0, 0 };
1556
0
  fz_css_style s1, s2;
1557
0
  memcpy(&s1, root->style, sizeof s1);
1558
0
  memcpy(&s2, from->style, sizeof s2);
1559
0
  s1.background_color = s2.background_color;
1560
0
  s2.background_color = transparent;
1561
0
  root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool);
1562
0
  from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool);
1563
0
}
1564
1565
static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root)
1566
53
{
1567
53
  fz_html_box *html, *body;
1568
1569
53
  if (root->style->background_color.a != 0)
1570
0
  {
1571
0
    return;
1572
0
  }
1573
1574
53
  html = root->down;
1575
53
  if (html && !strcmp(html->tag, "html"))
1576
52
  {
1577
52
    if (html->style->background_color.a != 0)
1578
0
    {
1579
0
      move_background_color_style_up(ctx, g, root, html);
1580
0
      return;
1581
0
    }
1582
1583
52
    body = html->down;
1584
52
    if (body && !strcmp(body->tag, "body"))
1585
52
    {
1586
52
      if (body->style->background_color.a != 0)
1587
0
      {
1588
0
        move_background_color_style_up(ctx, g, root, body);
1589
0
        return;
1590
0
      }
1591
52
    }
1592
52
  }
1593
53
}
1594
1595
static void
1596
xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css,
1597
  fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi)
1598
53
{
1599
53
  fz_xml *root, *node;
1600
53
  char *title;
1601
1602
53
  fz_css_match root_match, match;
1603
53
  struct genstate g = {0};
1604
1605
53
  g.pool = NULL;
1606
53
  g.set = set;
1607
53
  g.zip = zip;
1608
53
  g.images = NULL;
1609
53
  g.xml = xml;
1610
53
  g.is_fb2 = 0;
1611
53
  g.base_uri = base_uri;
1612
53
  g.css = NULL;
1613
53
  g.at_bol = 0;
1614
53
  g.emit_white = 0;
1615
53
  g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
1616
53
  g.list_counter = 0;
1617
53
  g.section_depth = 0;
1618
53
  g.markup_dir = FZ_BIDI_LTR;
1619
53
  g.markup_lang = FZ_LANG_UNSET;
1620
53
  g.href = NULL;
1621
53
  g.styles = NULL;
1622
1623
53
  if (rtitle)
1624
14
    *rtitle = NULL;
1625
1626
53
  root = fz_xml_root(g.xml);
1627
53
  g.css = fz_new_css(ctx);
1628
1629
53
#ifndef NDEBUG
1630
53
  if (fz_atoi(getenv("FZ_DEBUG_XML")))
1631
0
    fz_debug_xml(root, 0);
1632
53
#endif
1633
1634
106
  fz_try(ctx)
1635
106
  {
1636
53
    if (try_fictionbook && fz_xml_find(root, "FictionBook"))
1637
0
    {
1638
0
      g.is_fb2 = 1;
1639
0
      fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
1640
0
      if (fz_use_document_css(ctx))
1641
0
        fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1642
0
      g.images = load_fb2_images(ctx, root);
1643
0
    }
1644
53
    else if (is_mobi)
1645
0
    {
1646
0
      g.is_fb2 = 0;
1647
0
      fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1648
0
      fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>");
1649
0
      if (fz_use_document_css(ctx))
1650
0
        html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1651
0
    }
1652
53
    else
1653
53
    {
1654
53
      g.is_fb2 = 0;
1655
53
      fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1656
53
      if (fz_use_document_css(ctx))
1657
53
        html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1658
53
    }
1659
1660
53
    if (user_css)
1661
39
    {
1662
39
      fz_parse_css(ctx, g.css, user_css, "<user>");
1663
39
      fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
1664
39
    }
1665
53
  }
1666
106
  fz_catch(ctx)
1667
1
  {
1668
1
    fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1669
1
    fz_drop_css(ctx, g.css);
1670
1
    fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1671
1
    fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1672
1
    fz_report_error(ctx);
1673
1
    fz_warn(ctx, "ignoring styles");
1674
1
    g.css = fz_new_css(ctx);
1675
1
    g.images = NULL;
1676
1
  }
1677
1678
53
#ifndef NDEBUG
1679
53
  if (fz_atoi(getenv("FZ_DEBUG_CSS")))
1680
0
    fz_debug_css(ctx, g.css);
1681
53
#endif
1682
1683
106
  fz_try(ctx)
1684
106
  {
1685
53
    fz_css_style style;
1686
53
    int display;
1687
1688
53
    fz_match_css_at_page(ctx, &root_match, g.css);
1689
53
    fz_apply_css_style(ctx, g.set, &style, &root_match);
1690
1691
53
    g.pool = tree->pool;
1692
53
    g.markup_dir = DEFAULT_DIR;
1693
53
    g.markup_lang = FZ_LANG_UNSET;
1694
1695
    // Create root node
1696
53
    tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style);
1697
    // TODO: transfer page margins out of this hacky box
1698
1699
53
    tree->root->tag = ":root";
1700
53
    tree->root->s.layout.em = 0;
1701
53
    tree->root->s.layout.x = 0;
1702
53
    tree->root->s.layout.y = 0;
1703
53
    tree->root->s.layout.w = 0;
1704
53
    tree->root->s.layout.b = 0;
1705
1706
    // Create document node (html).
1707
53
    fz_match_css(ctx, &match, &root_match, g.css, root);
1708
53
    fz_apply_css_style(ctx, g.set, &style, &match);
1709
53
    display = fz_get_css_match_display(&match);
1710
53
    gen2_tag(ctx, &g, tree->root, root, &match, display, &style);
1711
1712
53
    detect_directionality(ctx, g.pool, tree->root);
1713
1714
53
    if (g.is_fb2)
1715
0
    {
1716
0
      node = fz_xml_find(root, "FictionBook");
1717
0
      node = fz_xml_find_down(node, "description");
1718
0
      node = fz_xml_find_down(node, "title-info");
1719
0
      node = fz_xml_find_down(node, "book-title");
1720
0
      if (rtitle)
1721
0
      {
1722
0
        title = fz_xml_text(fz_xml_down(node));
1723
0
        if (title)
1724
0
          *rtitle = fz_pool_strdup(ctx, g.pool, title);
1725
0
      }
1726
0
    }
1727
53
    else
1728
53
    {
1729
53
      node = fz_xml_find(root, "html");
1730
53
      node = fz_xml_find_down(node, "head");
1731
53
      node = fz_xml_find_down(node, "title");
1732
53
      if (rtitle)
1733
14
      {
1734
14
        title = fz_xml_text(fz_xml_down(node));
1735
14
        if (title)
1736
3
          *rtitle = fz_pool_strdup(ctx, g.pool, title);
1737
14
      }
1738
1739
      // Move html or body background-color to :root.
1740
53
      move_background_color_up(ctx, &g, tree->root);
1741
53
    }
1742
53
  }
1743
106
  fz_always(ctx)
1744
53
  {
1745
53
    fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1746
53
    fz_drop_css(ctx, g.css);
1747
53
  }
1748
53
  fz_catch(ctx)
1749
0
  {
1750
0
    if (rtitle)
1751
0
    {
1752
0
      fz_free(ctx, *rtitle);
1753
0
      *rtitle = NULL;
1754
0
    }
1755
    /* Dropping the tree works regardless of whether the tree is part of an fz_html or not. */
1756
0
    fz_drop_html_tree(ctx, tree);
1757
0
    fz_rethrow(ctx);
1758
0
  }
1759
53
}
1760
1761
static const char *mobi_font_size[7] = {
1762
  "0.67em",
1763
  "0.83em",
1764
  "1em",
1765
  "1.17em",
1766
  "1.33em",
1767
  "1.5em",
1768
  "1.67em",
1769
};
1770
1771
static void
1772
patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node)
1773
0
{
1774
0
  fz_xml *down;
1775
0
  char buf[500];
1776
0
  while (node)
1777
0
  {
1778
0
    char *tag = fz_xml_tag(node);
1779
0
    if (tag)
1780
0
    {
1781
      // Read MOBI attributes, convert to inline CSS style
1782
0
      if (!strcmp(tag, "font"))
1783
0
      {
1784
0
        const char *size = fz_xml_att(node, "size");
1785
0
        if (size)
1786
0
        {
1787
0
          if (!strcmp(size, "1")) size = mobi_font_size[0];
1788
0
          else if (!strcmp(size, "2")) size = mobi_font_size[1];
1789
0
          else if (!strcmp(size, "3")) size = mobi_font_size[2];
1790
0
          else if (!strcmp(size, "4")) size = mobi_font_size[3];
1791
0
          else if (!strcmp(size, "5")) size = mobi_font_size[4];
1792
0
          else if (!strcmp(size, "6")) size = mobi_font_size[5];
1793
0
          else if (!strcmp(size, "7")) size = mobi_font_size[6];
1794
0
          else if (!strcmp(size, "+1")) size = mobi_font_size[3];
1795
0
          else if (!strcmp(size, "+2")) size = mobi_font_size[4];
1796
0
          else if (!strcmp(size, "+3")) size = mobi_font_size[5];
1797
0
          else if (!strcmp(size, "+4")) size = mobi_font_size[6];
1798
0
          else if (!strcmp(size, "+5")) size = mobi_font_size[6];
1799
0
          else if (!strcmp(size, "+6")) size = mobi_font_size[6];
1800
0
          else if (!strcmp(size, "-1")) size = mobi_font_size[1];
1801
0
          else if (!strcmp(size, "-2")) size = mobi_font_size[0];
1802
0
          else if (!strcmp(size, "-3")) size = mobi_font_size[0];
1803
0
          else if (!strcmp(size, "-4")) size = mobi_font_size[0];
1804
0
          else if (!strcmp(size, "-5")) size = mobi_font_size[0];
1805
0
          else if (!strcmp(size, "-6")) size = mobi_font_size[0];
1806
0
          fz_snprintf(buf, sizeof buf, "font-size:%s", size);
1807
0
          fz_xml_add_att(ctx, pool, node, "style", buf);
1808
0
        }
1809
0
      }
1810
0
      else
1811
0
      {
1812
0
        char *height = fz_xml_att(node, "height");
1813
0
        char *width = fz_xml_att(node, "width");
1814
0
        char *align = fz_xml_att(node, "align");
1815
0
        if (height || width || align)
1816
0
        {
1817
0
          buf[0] = 0;
1818
0
          if (height)
1819
0
          {
1820
0
            fz_strlcat(buf, "margin-top:", sizeof buf);
1821
0
            fz_strlcat(buf, height, sizeof buf);
1822
0
            fz_strlcat(buf, ";", sizeof buf);
1823
0
          }
1824
0
          if (width)
1825
0
          {
1826
0
            fz_strlcat(buf, "text-indent:", sizeof buf);
1827
0
            fz_strlcat(buf, width, sizeof buf);
1828
0
            fz_strlcat(buf, ";", sizeof buf);
1829
0
          }
1830
0
          if (align)
1831
0
          {
1832
0
            fz_strlcat(buf, "text-align:", sizeof buf);
1833
0
            fz_strlcat(buf, align, sizeof buf);
1834
0
            fz_strlcat(buf, ";", sizeof buf);
1835
0
          }
1836
0
          fz_xml_add_att(ctx, pool, node, "style", buf);
1837
0
        }
1838
0
        if (!strcmp(tag, "img"))
1839
0
        {
1840
0
          char *recindex = fz_xml_att(node, "recindex");
1841
0
          if (recindex)
1842
0
            fz_xml_add_att(ctx, pool, node, "src", recindex);
1843
0
        }
1844
0
      }
1845
0
    }
1846
1847
0
    down = fz_xml_down(node);
1848
0
    if (down)
1849
0
      patch_mobi_html(ctx, pool, down);
1850
1851
0
    node = fz_xml_next(node);
1852
0
  }
1853
0
}
1854
1855
static void
1856
fz_parse_html_tree(fz_context *ctx,
1857
  fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
1858
  int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi)
1859
14
{
1860
14
  fz_xml_doc *xml;
1861
1862
14
  if (rtitle)
1863
14
    *rtitle = NULL;
1864
1865
14
  xml = parse_to_xml(ctx, buf, try_xml, try_html5);
1866
1867
14
  if (patch_mobi)
1868
0
    patch_mobi_html(ctx, xml->u.doc.pool, xml);
1869
1870
28
  fz_try(ctx)
1871
28
    xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi);
1872
28
  fz_always(ctx)
1873
14
    fz_drop_xml(ctx, xml);
1874
14
  fz_catch(ctx)
1875
0
    fz_rethrow(ctx);
1876
14
}
1877
1878
#define fz_new_derived_html_tree(CTX, TYPE, DROP) \
1879
53
 ((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE))
1880
1881
static fz_html_tree *
1882
fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop)
1883
53
{
1884
53
  fz_pool *pool = fz_new_pool(ctx);
1885
53
  fz_html_tree *tree;
1886
1887
106
  fz_try(ctx)
1888
106
  {
1889
53
    tree = fz_pool_alloc(ctx, pool, size);
1890
53
    FZ_INIT_STORABLE(tree, 1, drop);
1891
53
    tree->pool = pool;
1892
53
  }
1893
106
  fz_catch(ctx)
1894
0
  {
1895
0
    fz_drop_pool(ctx, pool);
1896
0
    fz_rethrow(ctx);
1897
0
  }
1898
1899
53
  return tree;
1900
53
}
1901
1902
fz_html *
1903
fz_parse_html(fz_context *ctx,
1904
  fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
1905
  int try_xml, int try_html5, int patch_mobi)
1906
14
{
1907
14
  fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp);
1908
1909
14
  html->layout_w = 0;
1910
14
  html->layout_h = 0;
1911
14
  html->layout_em = 0;
1912
1913
28
  fz_try(ctx)
1914
28
    fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi);
1915
28
  fz_catch(ctx)
1916
0
  {
1917
0
    fz_drop_html(ctx, html);
1918
0
    fz_rethrow(ctx);
1919
0
  }
1920
1921
14
  return html;
1922
14
}
1923
1924
typedef struct
1925
{
1926
  int saved;
1927
  fz_warning_cb *old;
1928
  void *arg;
1929
  fz_buffer *buffer;
1930
  fz_context *ctx;
1931
} warning_save;
1932
1933
static void
1934
warn_to_buffer(void *user, const char *message)
1935
5
{
1936
5
  warning_save *save = (warning_save *)user;
1937
5
  fz_context *ctx = save->ctx;
1938
1939
10
  fz_try(ctx)
1940
10
  {
1941
5
    fz_append_string(ctx, save->buffer, message);
1942
5
    fz_append_byte(ctx, save->buffer, '\n');
1943
5
  }
1944
10
  fz_catch(ctx)
1945
0
  {
1946
    /* Silently swallow the error. */
1947
0
    fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1948
0
    fz_report_error(ctx);
1949
0
  }
1950
5
}
1951
1952
static void
1953
redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save)
1954
78
{
1955
78
  save->saved = 1;
1956
78
  save->old = fz_warning_callback(ctx, &save->arg);
1957
78
  save->buffer = buf;
1958
78
  save->ctx = ctx;
1959
1960
78
  fz_flush_warnings(ctx);
1961
78
  fz_set_warning_callback(ctx, warn_to_buffer, save);
1962
78
}
1963
1964
static void
1965
restore_warnings(fz_context *ctx, warning_save *save)
1966
78
{
1967
78
  if (!save->saved)
1968
0
    return;
1969
1970
78
  fz_flush_warnings(ctx);
1971
78
  fz_set_warning_callback(ctx, save->old, save->arg);
1972
78
}
1973
1974
fz_story *
1975
fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip)
1976
39
{
1977
39
  fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp);
1978
39
  warning_save saved = { 0 };
1979
39
  fz_buffer *local_buffer = NULL;
1980
1981
39
  if (buf == NULL)
1982
0
  {
1983
0
    local_buffer = fz_new_buffer(ctx, 0);
1984
0
    buf = local_buffer;
1985
0
  }
1986
1987
39
  fz_var(local_buffer);
1988
39
  fz_var(saved);
1989
1990
78
  fz_try(ctx)
1991
78
  {
1992
39
    story->zip = fz_keep_archive(ctx, zip);
1993
39
    story->font_set = fz_new_html_font_set(ctx);
1994
39
    story->em = em;
1995
39
    story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL;
1996
39
    story->warnings = fz_new_buffer(ctx, 128);
1997
39
    redirect_warnings_to_buffer(ctx, story->warnings, &saved);
1998
39
    story->dom = parse_to_xml(ctx, buf, 0, 1);
1999
39
  }
2000
78
  fz_always(ctx)
2001
39
  {
2002
39
    restore_warnings(ctx, &saved);
2003
39
    fz_drop_buffer(ctx, local_buffer);
2004
39
  }
2005
39
  fz_catch(ctx)
2006
0
  {
2007
0
    fz_drop_html_tree(ctx, &story->tree);
2008
0
    fz_rethrow(ctx);
2009
0
  }
2010
2011
39
  return story;
2012
39
}
2013
2014
fz_html *
2015
fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
2016
0
{
2017
  /* try as XML first, fall back to HTML5 */
2018
0
  return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0);
2019
0
}
2020
2021
static void indent(int level)
2022
0
{
2023
0
  while (level-- > 0)
2024
0
    putchar('\t');
2025
0
}
2026
2027
static void
2028
fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
2029
0
{
2030
0
  fz_html_box *sbox = NULL;
2031
0
  while (flow)
2032
0
  {
2033
0
    if (flow->box != sbox) {
2034
0
      sbox = flow->box;
2035
0
      indent(level);
2036
0
#ifndef NDEBUG
2037
0
      printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
2038
#else
2039
      printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
2040
#endif
2041
0
      if (fz_font_is_serif(ctx, sbox->style->font))
2042
0
        printf(" serif");
2043
0
      else
2044
0
        printf(" sans");
2045
0
      if (fz_font_is_monospaced(ctx, sbox->style->font))
2046
0
        printf(" monospaced");
2047
0
      if (fz_font_is_bold(ctx, sbox->style->font))
2048
0
        printf(" bold");
2049
0
      if (fz_font_is_italic(ctx, sbox->style->font))
2050
0
        printf(" italic");
2051
0
      if (sbox->style->small_caps)
2052
0
        printf(" small-caps");
2053
0
      printf("\n");
2054
0
    }
2055
2056
0
    indent(level);
2057
0
    switch (flow->type) {
2058
0
    case FLOW_WORD: printf("word "); break;
2059
0
    case FLOW_SPACE: printf("space"); break;
2060
0
    case FLOW_SBREAK: printf("sbrk "); break;
2061
0
    case FLOW_SHYPHEN: printf("shy  "); break;
2062
0
    case FLOW_BREAK: printf("break"); break;
2063
0
    case FLOW_IMAGE: printf("image"); break;
2064
0
    case FLOW_ANCHOR: printf("anchor"); break;
2065
0
    }
2066
    // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w);
2067
0
    if (flow->type == FLOW_IMAGE)
2068
0
      printf(" h=%g", flow->h);
2069
0
    if (flow->type == FLOW_WORD)
2070
0
      printf(" text='%s'", flow->content.text);
2071
0
    printf("\n");
2072
0
    if (flow->breaks_line) {
2073
0
      indent(level);
2074
0
      printf("*\n");
2075
0
    }
2076
2077
0
    flow = flow->next;
2078
0
  }
2079
0
}
2080
2081
fz_structure fz_html_tag_to_structure(const char *tag)
2082
1.58k
{
2083
1.58k
  if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT;
2084
1.52k
  if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV;
2085
1.51k
  if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN;
2086
1.51k
  if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE;
2087
1.51k
  if (!strcmp(tag, "p")) return FZ_STRUCTURE_P;
2088
279
  if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1;
2089
276
  if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2;
2090
276
  if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3;
2091
276
  if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4;
2092
276
  if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5;
2093
276
  if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6;
2094
276
  if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST;
2095
276
  if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST;
2096
276
  if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST;
2097
276
  if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM;
2098
276
  if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE;
2099
276
  if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR;
2100
276
  if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH;
2101
276
  if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD;
2102
276
  if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD;
2103
276
  if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY;
2104
276
  if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT;
2105
276
  return FZ_STRUCTURE_INVALID;
2106
276
}
2107
2108
static void
2109
fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
2110
0
{
2111
0
  while (box)
2112
0
  {
2113
0
    indent(level);
2114
0
    printf("box ");
2115
0
    switch (box->type) {
2116
0
    case BOX_BLOCK: printf("block"); break;
2117
0
    case BOX_FLOW: printf("flow"); break;
2118
0
    case BOX_INLINE: printf("inline"); break;
2119
0
    case BOX_TABLE: printf("table"); break;
2120
0
    case BOX_TABLE_ROW: printf("table-row"); break;
2121
0
    case BOX_TABLE_CELL: printf("table-cell"); break;
2122
0
    }
2123
2124
0
    printf(" <%s>", box->tag);
2125
    // printf(" em=%g", box->em);
2126
    // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b);
2127
2128
0
    if (box->is_first_flow)
2129
0
      printf(" is-first-flow");
2130
0
    if (box->list_item)
2131
0
      printf(" list=%d", box->list_item);
2132
0
    if (box->id)
2133
0
      printf(" id=(%s)", box->id);
2134
0
    if (box->href)
2135
0
      printf(" href=(%s)", box->href);
2136
0
    printf("\n");
2137
2138
0
    if (box->type == BOX_BLOCK || box->type == BOX_TABLE) {
2139
0
      indent(level+1);
2140
0
      printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]);
2141
      //indent(level+1);
2142
      //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]);
2143
      //indent(level+1);
2144
      //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]);
2145
0
    }
2146
2147
0
    if (box->down)
2148
0
      fz_debug_html_box(ctx, box->down, level + 1);
2149
0
    if (box->type == BOX_FLOW) {
2150
0
      indent(level+1);
2151
0
      printf("flow\n");
2152
0
      fz_debug_html_flow(ctx, box->u.flow.head, level + 2);
2153
0
    }
2154
2155
0
    box = box->next;
2156
0
  }
2157
0
}
2158
2159
void
2160
fz_debug_html(fz_context *ctx, fz_html_box *box)
2161
0
{
2162
0
  fz_debug_html_box(ctx, box, 0);
2163
0
}
2164
2165
static size_t
2166
fz_html_size(fz_context *ctx, fz_html *html)
2167
0
{
2168
0
  return html ? fz_pool_size(ctx, html->tree.pool) : 0;
2169
0
}
2170
2171
/* Magic to make html storable. */
2172
typedef struct {
2173
  int refs;
2174
  void *doc;
2175
  int chapter_num;
2176
} fz_html_key;
2177
2178
static int
2179
fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
2180
0
{
2181
0
  fz_html_key *key = (fz_html_key *)key_;
2182
0
  hash->u.pi.ptr = key->doc;
2183
0
  hash->u.pi.i = key->chapter_num;
2184
0
  return 1;
2185
0
}
2186
2187
static void *
2188
fz_keep_html_key(fz_context *ctx, void *key_)
2189
0
{
2190
0
  fz_html_key *key = (fz_html_key *)key_;
2191
0
  return fz_keep_imp(ctx, key, &key->refs);
2192
0
}
2193
2194
static void
2195
fz_drop_html_key(fz_context *ctx, void *key_)
2196
0
{
2197
0
  fz_html_key *key = (fz_html_key *)key_;
2198
0
  if (fz_drop_imp(ctx, key, &key->refs))
2199
0
  {
2200
0
    fz_free(ctx, key);
2201
0
  }
2202
0
}
2203
2204
static int
2205
fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
2206
0
{
2207
0
  fz_html_key *k0 = (fz_html_key *)k0_;
2208
0
  fz_html_key *k1 = (fz_html_key *)k1_;
2209
0
  return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num;
2210
0
}
2211
2212
static void
2213
fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_)
2214
0
{
2215
0
  fz_html_key *key = (fz_html_key *)key_;
2216
0
  fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num);
2217
0
}
2218
2219
static const fz_store_type fz_html_store_type =
2220
{
2221
  "fz_html",
2222
  fz_make_hash_html_key,
2223
  fz_keep_html_key,
2224
  fz_drop_html_key,
2225
  fz_cmp_html_key,
2226
  fz_format_html_key,
2227
  NULL
2228
};
2229
2230
fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
2231
0
{
2232
0
  fz_html_key *key = NULL;
2233
0
  fz_html *other_html;
2234
2235
  /* Stick the parsed html in the store */
2236
0
  fz_var(key);
2237
2238
0
  fz_try(ctx)
2239
0
  {
2240
0
    key = fz_malloc_struct(ctx, fz_html_key);
2241
0
    key->refs = 1;
2242
0
    key->doc = doc;
2243
0
    key->chapter_num = chapter;
2244
0
    other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
2245
0
    if (other_html)
2246
0
    {
2247
0
      fz_drop_html(ctx, html);
2248
0
      html = other_html;
2249
0
    }
2250
0
  }
2251
0
  fz_always(ctx)
2252
0
    fz_drop_html_key(ctx, key);
2253
0
  fz_catch(ctx)
2254
0
  {
2255
    /* Do nothing */
2256
0
  }
2257
2258
0
  return html;
2259
0
}
2260
2261
fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
2262
0
{
2263
0
  fz_html_key key;
2264
2265
0
  key.refs = 1;
2266
0
  key.doc = doc;
2267
0
  key.chapter_num = chapter;
2268
0
  return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type);
2269
0
}
2270
2271
static int
2272
html_filter_store(fz_context *ctx, void *doc, void *key_)
2273
0
{
2274
0
  fz_html_key *key = (fz_html_key *)key_;
2275
2276
0
  return (doc == key->doc);
2277
0
}
2278
2279
void fz_purge_stored_html(fz_context *ctx, void *doc)
2280
0
{
2281
0
  fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type);
2282
0
}
2283
2284
static void
2285
convert_to_boxes(fz_context *ctx, fz_story *story)
2286
39
{
2287
39
  warning_save saved = { 0 };
2288
2289
39
  if (story->dom == NULL)
2290
0
    return;
2291
2292
39
  fz_var(saved);
2293
2294
78
  fz_try(ctx)
2295
78
  {
2296
39
    redirect_warnings_to_buffer(ctx, story->warnings, &saved);
2297
39
    xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0);
2298
39
  }
2299
78
  fz_always(ctx)
2300
39
  {
2301
39
    fz_drop_xml(ctx, story->dom);
2302
39
    story->dom = NULL;
2303
39
    restore_warnings(ctx, &saved);
2304
39
  }
2305
39
  fz_catch(ctx)
2306
0
    fz_rethrow(ctx);
2307
39
}
2308
2309
int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled)
2310
39
{
2311
39
  return fz_place_story_flags(ctx, story, where, filled, 0);
2312
39
}
2313
2314
int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags)
2315
39
{
2316
39
  float w, h;
2317
2318
39
  if (filled)
2319
0
    *filled = fz_empty_rect;
2320
2321
39
  if (story == NULL || story->complete)
2322
0
    return 0;
2323
2324
  /* Convert from XML to box model on the first attempt to place.
2325
   * The DOM is unusable from here on in. */
2326
39
  convert_to_boxes(ctx, story);
2327
2328
39
  w = where.x1 - where.x0;
2329
39
  h = where.y1 - where.y0;
2330
  /* Confusingly, we call the layout using restart_draw, not restart_place,
2331
   * because we don't want to destroy the current values in restart_place
2332
   * in case we have to retry later. This means the values are left in
2333
   * the correct struct though! */
2334
39
  story->restart_draw.start = story->restart_place.start;
2335
39
  story->restart_draw.start_flow = story->restart_place.start_flow;
2336
39
  story->restart_draw.end = NULL;
2337
39
  story->restart_draw.end_flow = NULL;
2338
39
  story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE;
2339
39
  story->restart_draw.flags = flags;
2340
39
  story->bbox = where;
2341
39
  fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw);
2342
39
  story->restart_draw.start = story->restart_place.start;
2343
39
  story->restart_draw.start_flow = story->restart_place.start_flow;
2344
2345
39
  if (filled)
2346
0
  {
2347
0
    fz_html_box *b = story->tree.root;
2348
0
    filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L];
2349
0
    filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x;
2350
0
    filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T];
2351
0
    filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B];
2352
0
  }
2353
2354
39
#ifndef NDEBUG
2355
39
  if (fz_atoi(getenv("FZ_DEBUG_HTML")))
2356
0
    fz_debug_html(ctx, story->tree.root);
2357
39
#endif
2358
2359
39
  if (story->restart_draw.end == NULL)
2360
39
    return FZ_HTML_RESTART_REASON_NONE;
2361
0
  if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH)
2362
0
    return FZ_HTML_RESTART_REASON_LINE_WIDTH;
2363
0
  return FZ_HTML_RESTART_REASON_LINE_HEIGHT;
2364
0
}
2365
2366
const char *
2367
fz_story_warnings(fz_context *ctx, fz_story *story)
2368
0
{
2369
0
  unsigned char *data;
2370
2371
0
  if (!story)
2372
0
    return NULL;
2373
2374
0
  convert_to_boxes(ctx, story);
2375
2376
0
  fz_terminate_buffer(ctx, story->warnings);
2377
2378
0
  if (fz_buffer_storage(ctx, story->warnings, &data) == 0)
2379
0
    return NULL;
2380
2381
0
  return (const char *)data;
2382
0
}