Coverage Report

Created: 2025-12-03 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/mupdf/source/html/html-parse.c
Line
Count
Source
1
// Copyright (C) 2004-2025 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "mupdf/fitz.h"
24
#include "mupdf/ucdn.h"
25
#include "html-imp.h"
26
27
#include <string.h>
28
#include <stdio.h>
29
#include <assert.h>
30
31
/* Four small index constants (0..3). NOTE(review): presumably top/right/bottom/left edge indices; no use is visible in this excerpt - confirm. */ enum { T, R, B, L };
32
33
/*
 * Built-in CSS rules for HTML elements, stored as one concatenated string.
 * The element rules are listed alphabetically, followed by a few later
 * additions (table columns, captions and HTML5 sectioning elements).
 * NOTE(review): where this sheet is applied is outside this excerpt.
 */
static const char *html_default_css =
  /* page box */
  "@page{margin:3em 2em}"
  /* element rules, alphabetical */
  "a:link{color:blue;text-decoration:underline}"
  "address{display:block;font-style:italic}"
  "b{font-weight:bold}"
  "bdo{direction:rtl;unicode-bidi:bidi-override}"
  "blockquote{display:block;margin:1em 40px}"
  "body{display:block;margin:1em}"
  "cite{font-style:italic}"
  "code{font-family:monospace}"
  "dd{display:block;margin:0 0 0 40px}"
  "del{text-decoration:line-through}"
  "div{display:block}"
  "dl{display:block;margin:1em 0}"
  "dt{display:block}"
  "em{font-style:italic}"
  "h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
  "h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
  "h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
  "h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
  "h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
  "h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
  "head{display:none}"
  "hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
  "html{display:block}"
  "i{font-style:italic}"
  "ins{text-decoration:underline}"
  "kbd{font-family:monospace}"
  "li{display:list-item}"
  "menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
  "ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
  "p{display:block;margin:1em 0}"
  "pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
  "samp{font-family:monospace}"
  "script{display:none}"
  "small{font-size:0.83em}"
  "strong{font-weight:bold}"
  "style{display:none}"
  "sub{font-size:0.83em;vertical-align:sub}"
  "sup{font-size:0.83em;vertical-align:super}"
  "table{display:table;border-spacing:2px}"
  "tbody{display:table-row-group}"
  "td{display:table-cell;padding:1px;background-color:inherit}"
  "tfoot{display:table-footer-group}"
  "th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}"
  "thead{display:table-header-group}"
  "tr{display:table-row}"
  "ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
  "ul ul{list-style-type:circle}"
  "ul ul ul{list-style-type:square}"
  "var{font-style:italic}"
  /* later additions (not in alphabetical order) */
  "colgroup{display:table-column-group}"
  "col{display:table-column}"
  "figcaption,caption{display:block;text-align:center}"
  "address,article,aside,figure,footer,header,hgroup,main,nav,section,search{display:block}"
  ;
89
90
/*
 * Additional built-in CSS rules for MOBI content: a forced page break for
 * the "pagebreak" element, zeroed list/paragraph margins and a few legacy
 * presentational elements.
 * NOTE(review): presumably layered on top of the HTML defaults - confirm
 * against the code that loads it (outside this excerpt).
 */
static const char *mobi_default_css =
  "pagebreak{display:block;page-break-before:always}"
  "dl,ol,ul{margin:0}"
  "p{margin:0}"
  "blockquote{margin:0 40px}"
  "center{display:block;text-align:center}"
  "big{font-size:1.17em}"
  "strike{text-decoration:line-through}"
  ;
99
100
/*
 * Built-in CSS rules for FictionBook (FB2) documents, stored as one
 * concatenated string. Rule order is significant for the cascade and is
 * kept exactly as written.
 */
static const char *fb2_default_css =
  /* page box and document root */
  "@page{margin:3em 2em}"
  "FictionBook{display:block;margin:1em}"
  /* metadata: hide everything except the cover page */
  "stylesheet,binary{display:none}"
  "description>*{display:none}"
  "description>title-info{display:block}"
  "description>title-info>*{display:none}"
  "description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
  /* display types */
  "body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
  "image{display:block}"
  "p>image{display:inline}"
  "table{display:table}"
  "tr{display:table-row}"
  "th,td{display:table-cell}"
  /* inline formatting */
  "a{color:blue;text-decoration:underline}"
  "a[type=note]{font-size:small;vertical-align:super}"
  "code{white-space:pre;font-family:monospace}"
  "emphasis{font-style:italic}"
  "strikethrough{text-decoration:line-through}"
  "strong{font-weight:bold}"
  "sub{font-size:small;vertical-align:sub}"
  "sup{font-size:small;vertical-align:super}"
  /* margins, alignment and pagination */
  "image{margin:1em 0;text-align:center}"
  "cite,poem{margin:1em 2em}"
  "subtitle,epigraph,stanza{margin:1em 0}"
  "title>p{text-align:center;font-size:x-large}"
  "subtitle{text-align:center;font-size:large}"
  "p{margin-top:1em;text-align:justify}"
  "empty-line{padding-top:1em}"
  "p+p{margin-top:0;text-indent:1.5em}"
  "empty-line+p{margin-top:0}"
  "section>title{page-break-before:always}"
  ;
133
134
/*
 * Known HTML tag names. MUST stay sorted in strcmp() order, because the
 * lookup below is a binary search.
 */
static const char *known_html_tags[] = {
  // TODO: add known FB2 tags?
  // Sorted list of all HTML tags.
  "a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
  "article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
  "bgsound", "big", "blink", "blockquote", "body", "br", "button",
  "canvas", "caption", "center", "cite", "code", "col", "colgroup",
  "data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
  "div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
  "font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
  "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
  "i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
  "keygen", "label", "legend", "li", "link", "listing", "main",
  "malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
  "meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
  "nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
  "ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
  "progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
  "script", "section", "select", "small", "source", "spacer", "span",
  "strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
  "tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
  "title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
};

/*
 * Known FictionBook (FB2) tag names. MUST stay sorted in strcmp() order
 * (note that the upper-case "FictionBook" sorts before lower-case names).
 */
static const char *known_fb2_tags[] = {
  "FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
  "date", "description", "emphasis", "empty-line", "epigraph", "image",
  "p", "poem", "section", "stanza", "strikethrough", "strong",
  "stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
  "th", "title", "title-info", "tr", "v",
};

/*
 * Binary-search a strcmp()-sorted table of tag names.
 *
 * table: sorted array of NUL-terminated strings.
 * count: number of entries in table.
 * tag:   name to look up (must not be NULL).
 *
 * Returns the table's own pointer for the matching entry (so callers can
 * share the static string), or NULL if the tag is not in the table.
 */
static const char *find_known_tag(const char *const *table, int count, const char *tag)
{
  int lo = 0;
  int hi = count - 1;
  while (lo <= hi)
  {
    int mid = lo + ((hi - lo) >> 1);
    int cmp = strcmp(tag, table[mid]);
    if (cmp < 0)
      hi = mid - 1;
    else if (cmp > 0)
      lo = mid + 1;
    else
      return table[mid];
  }
  return NULL;
}

/*
 * Look up an HTML tag name. Returns the static string from
 * known_html_tags, or NULL if the name is unknown.
 *
 * The search covers the WHOLE table. The previous code initialised the
 * upper bound to "count / 2 - 1", which silently ignored the second half
 * of the list (e.g. "p", "table", "td").
 */
static const char *find_known_html_tag(const char *tag)
{
  return find_known_tag(known_html_tags,
    (int)(sizeof known_html_tags / sizeof known_html_tags[0]), tag);
}

/*
 * Look up an FB2 tag name. Returns the static string from known_fb2_tags,
 * or NULL if the name is unknown. Searches the whole table (see
 * find_known_html_tag for the bug this replaces).
 */
static const char *find_known_fb2_tag(const char *tag)
{
  return find_known_tag(known_fb2_tags,
    (int)(sizeof known_fb2_tags / sizeof known_fb2_tags[0]), tag);
}
201
202
typedef struct
203
{
204
  int maxcols;
205
  int ncols;
206
  col_style *styles;
207
}
208
table_styles;
209
210
static void
211
drop_table_styles(fz_context *ctx, table_styles *ts)
212
0
{
213
0
  fz_free(ctx, ts->styles);
214
0
  ts->styles = NULL;
215
0
}
216
217
struct genstate
218
{
219
  fz_pool *pool;
220
  fz_html_font_set *set;
221
  fz_archive *zip;
222
  fz_tree *images;
223
  fz_xml_doc *xml;
224
  int is_fb2;
225
  const char *base_uri;
226
  fz_css *css;
227
  int at_bol;
228
  fz_html_box *emit_white;
229
  int last_brk_cls;
230
231
  int list_counter;
232
  int section_depth;
233
  fz_bidi_direction markup_dir;
234
  fz_text_language markup_lang;
235
  char *href;
236
237
  table_styles tab_styles;
238
  int col_num;
239
240
  fz_css_style_splay *styles;
241
};
242
243
/* Return 1 if c is an ASCII space, tab, carriage return or newline; else 0. */
static int iswhite(int c)
{
  switch (c)
  {
  case ' ':
  case '\t':
  case '\r':
  case '\n':
    return 1;
  default:
    return 0;
  }
}
247
248
/*
 * Return 1 if every character of the NUL-terminated string s is whitespace
 * according to iswhite() (an empty string also yields 1); otherwise 0.
 */
static int is_all_white(const char *s)
{
  const char *p;

  for (p = s; *p != 0; ++p)
    if (!iswhite(*p))
      return 0;
  return 1;
}
258
259
/* TODO: pool allocator for flow nodes */
260
/* TODO: store text by pointing to a giant buffer */
261
262
/*
 * Walk a flow list and drop the image reference held by every FLOW_IMAGE
 * node. The nodes themselves are not freed here.
 */
static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
{
  fz_html_flow *node = flow;

  while (node != NULL)
  {
    /* Fetch the successor first, mirroring the usual safe-iteration form. */
    fz_html_flow *following = node->next;

    if (node->type == FLOW_IMAGE)
      fz_drop_image(ctx, node->content.image);

    node = following;
  }
}
272
273
/*
 * Allocate a flow node of the given type from 'pool' and append it to the
 * flow list of 'top'.
 *
 * top:        the enclosing BOX_FLOW box that owns the list.
 * inline_box: the box the new node is attributed to.
 * extras:     number of payload bytes to reserve after the header for
 *             non-image nodes (image nodes get the full struct instead).
 *
 * Returns the new node, or NULL when 'top' is NULL or is not a flow box.
 * Callers must cope with NULL.
 * NOTE(review): 'next' is not written here; this presumably relies on the
 * pool returning zeroed memory - confirm.
 */
static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
{
  fz_html_flow *node;
  size_t bytes;

  if (type == FLOW_IMAGE)
    bytes = sizeof(fz_html_flow);
  else
    bytes = offsetof(fz_html_flow, content) + extras;

  /* Shouldn't happen, but bug 705324. */
  if (top == NULL || top->type != BOX_FLOW)
    return NULL;

  node = fz_pool_alloc(ctx, pool, bytes);
  node->type = type;
  node->expand = 0;
  node->bidi_level = 0;
  node->markup_lang = 0;
  node->breaks_line = 0;
  node->box = inline_box;

  /* Link in at the tail and advance the tail pointer to the new node. */
  *top->s.build.flow_tail = node;
  top->s.build.flow_tail = &node->next;

  return node;
}
293
294
/*
 * Append a FLOW_SPACE node with its 'expand' flag set. Does nothing if the
 * node could not be added.
 */
static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
  fz_html_flow *gap = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0);

  if (gap == NULL)
    return;
  gap->expand = 1;
}
300
301
/* Append a FLOW_BREAK node; a failed add is silently ignored. */
static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
  fz_html_flow *ignored = add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);

  (void)ignored;
}
305
306
/* Append a FLOW_SBREAK node; a failed add is silently ignored. */
static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
  fz_html_flow *ignored = add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);

  (void)ignored;
}
310
311
/* Append a FLOW_SHYPHEN node; a failed add is silently ignored. */
static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
  fz_html_flow *ignored = add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);

  (void)ignored;
}
315
316
/*
 * Append a FLOW_WORD node holding a NUL-terminated copy of the bytes in
 * [a, b) and tag it with the markup language 'lang'. Does nothing if the
 * node could not be added.
 */
static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
{
  fz_html_flow *word;

  /* Reserve the text bytes plus one for the terminator. */
  word = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1);
  if (!word)
    return;

  memcpy(word->content.text, a, b - a);
  word->content.text[b - a] = 0;
  word->markup_lang = lang;
}
325
326
/*
 * Append a FLOW_IMAGE node that takes its own reference to 'img'. If the
 * node cannot be added, no reference is taken.
 */
static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
{
  fz_html_flow *node = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);

  if (node == NULL)
    return;
  node->content.image = fz_keep_image(ctx, img);
}
332
333
/* Append a FLOW_ANCHOR node; a failed add is silently ignored. */
static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
  fz_html_flow *ignored = add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);

  (void)ignored;
}
337
338
/*
 * Split a FLOW_WORD node in two after 'offset' characters (runes, not
 * bytes).
 *
 * The original node keeps the first 'offset' characters. A new node,
 * allocated from 'pool' and linked in directly after it, receives the
 * remainder and a copy of the original's header fields.
 *
 * Returns the new node, or 'flow' itself when offset is 0. If offset is
 * at or beyond the end of the text, the new node holds an empty string.
 */
fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
{
  fz_html_flow *rest;
  char *tail;
  size_t tail_len;
  size_t remaining;
  int rune;

  assert(flow->type == FLOW_WORD);

  if (offset == 0)
    return flow;

  /* Step over 'offset' characters, stopping early at end of string. */
  tail = flow->content.text;
  for (remaining = offset; *tail && remaining; --remaining)
    tail += fz_chartorune(&rune, tail);

  tail_len = strlen(tail);
  rest = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + tail_len+1);

  /* Clone everything up to the payload, then splice into the list. */
  memcpy(rest, flow, offsetof(fz_html_flow, content));
  rest->next = flow->next;
  flow->next = rest;

  /* Copy the tail (with terminator) and truncate the original in place. */
  memcpy(rest->content.text, tail, tail_len + 1);
  *tail = 0;

  return rest;
}
364
365
/*
 * Emit the pending collapsed whitespace recorded in g->emit_white, if any.
 *
 * Nothing is emitted at the beginning of a line. Otherwise the space is
 * added either as a breakable FLOW_SPACE or, when the owning box's
 * white-space style does not allow breaking at spaces, as a literal " "
 * word. The pending marker is cleared in every case.
 */
static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g)
{
  static const char *space = " ";
  fz_pool *pool = g->pool;
  fz_html_box *pending = g->emit_white;
  int breakable;

  if (!pending)
    return;

  breakable = pending->style->white_space & WS_ALLOW_BREAK_SPACE;
  if (!g->at_bol)
  {
    if (!breakable)
      add_flow_word(ctx, pool, flow, pending, space, space+1, lang);
    else
      add_flow_space(ctx, pool, flow, pending);
  }
  g->emit_white = 0;
}
382
383
/*
 * Pair-wise lookup table for UAX#14 line breaking.
 *
 * The table is indexed as pairbrk[B][A], where B is the line break class
 * of the previous character (row) and A is the class of the current
 * character (column). Each entry is one of:
 *
 *   ^  prohibited break:
 *      never break before A and after B, even with spaces in between
 *   %  indirect break:
 *      do not break before A, unless one or more spaces follow B
 *   _  direct break:
 *      break allowed before A
 *
 * Every row is exactly 32 characters long, one per class, in the same
 * order as the rows themselves.
 */
static const char *pairbrk[32] =
{
/*  -OCCQGNESIPPNAHIIHBBBZCWHHJJJREEZ- */
/*  -PLPULSXYSROULLDNYAB2WMJ23LVTIBMW- */
/*  -                               J- */
  "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
  "_^^%%^^^^%%____%%%__^^^________%", /* CL close punctuation */
  "_^^%%^^^^%%%%%_%%%__^^^________%", /* CP close parenthesis */
  "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* QU quotation */
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* GL non-breaking glue */
  "_^^%%%^^^______%%%__^^^________%", /* NS nonstarters */
  "_^^%%%^^^______%%%__^^^________%", /* EX exclamation/interrogation */
  "_^^%%%^^^__%_%_%%%__^^^________%", /* SY symbols allowing break after */
  "_^^%%%^^^__%%%_%%%__^^^________%", /* IS infix numeric separator */
  "%^^%%%^^^__%%%%%%%__^^^%%%%%_%%%", /* PR prefix numeric */
  "%^^%%%^^^__%%%_%%%__^^^________%", /* PO postfix numeric */
  "%^^%%%^^^%%%%%_%%%__^^^________%", /* NU numeric */
  "%^^%%%^^^%%%%%_%%%__^^^________%", /* AL ordinary alphabetic and symbol characters */
  "%^^%%%^^^%%%%%_%%%__^^^________%", /* HL hebrew letter */
  "_^^%%%^^^_%____%%%__^^^________%", /* ID ideographic */
  "_^^%%%^^^______%%%__^^^________%", /* IN inseparable characters */
  "_^^%_%^^^__%___%%%__^^^________%", /* HY hyphens */
  "_^^%_%^^^______%%%__^^^________%", /* BA break after */
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* BB break before */
  "_^^%%%^^^______%%%_^^^^________%", /* B2 break opportunity before and after */
  "____________________^___________", /* ZW zero width space */
  "%^^%%%^^^%_%%%_%%%__^^^________%", /* CM combining mark */
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* WJ word joiner */
  "_^^%%%^^^_%____%%%__^^^___%%___%", /* H2 hangul leading/vowel syllable */
  "_^^%%%^^^_%____%%%__^^^____%___%", /* H3 hangul leading/vowel/trailing syllable */
  "_^^%%%^^^_%____%%%__^^^%%%%____%", /* JL hangul leading jamo */
  "_^^%%%^^^_%____%%%__^^^___%%___%", /* JV hangul vowel jamo */
  "_^^%%%^^^_%____%%%__^^^____%___%", /* JT hangul trailing jamo */
  "_^^%%%^^^______%%%__^^^_____%__%", /* RI regional indicator */
  "_^^%%%^^^_%____%%%__^^^_______%%", /* EB emoji base */
  "_^^%%%^^^_%____%%%__^^^________%", /* EM emoji modifier */
  "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%%%%", /* ZWJ zero width joiner */
};
430
431
static fz_html_box *
432
find_flow_encloser(fz_context *ctx, fz_html_box *flow)
433
0
{
434
  /* This code was written to assume that there will always be a
435
   * flow box enclosing callers of this. Bug 705324 shows that
436
   * this isn't always the case. In the absence of a reproducer
437
   * file, all I can do is try to patch around the issue so that
438
   * we won't crash. */
439
0
  while (flow->type != BOX_FLOW)
440
0
  {
441
0
    if (flow->up == NULL)
442
0
    {
443
0
      fz_warn(ctx, "Flow encloser not found. Please report this file!");
444
0
      break;
445
0
    }
446
0
    flow = flow->up;
447
0
  }
448
0
  return flow;
449
0
}
450
451
static void
452
generate_text_run(fz_context *ctx, fz_html_box *box, fz_html_box *flow, const char *mark, const char *end, int lang, struct genstate *g)
453
0
{
454
0
  fz_pool *pool = g->pool;
455
0
  int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
456
0
  const char *text = mark;
457
0
  const char *prev;
458
0
  int c;
459
460
0
  while (text < end)
461
0
  {
462
0
    prev = text;
463
0
    text += fz_chartorune(&c, text);
464
0
    if (c == 0xAD) /* soft hyphen */
465
0
    {
466
0
      if (mark != prev)
467
0
        add_flow_word(ctx, pool, flow, box, mark, prev, lang);
468
0
      if (box->style->hyphens != HYP_NONE)
469
0
        add_flow_shyphen(ctx, pool, flow, box);
470
0
      mark = text;
471
0
      g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
472
0
    }
473
0
    else if (bsp) /* allow soft breaks */
474
0
    {
475
0
      int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
476
0
      if (this_brk_cls <= UCDN_LINEBREAK_CLASS_ZWJ)
477
0
      {
478
0
        int brk = pairbrk[g->last_brk_cls][this_brk_cls];
479
480
        /* we handle spaces elsewhere, so ignore these classes */
481
0
        if (brk == '@') brk = '^';
482
0
        if (brk == '#') brk = '^';
483
0
        if (brk == '%') brk = '^';
484
485
0
        if (brk == '_')
486
0
        {
487
0
          if (mark != prev)
488
0
            add_flow_word(ctx, pool, flow, box, mark, prev, lang);
489
0
          add_flow_sbreak(ctx, pool, flow, box);
490
0
          mark = prev;
491
0
        }
492
493
0
        g->last_brk_cls = this_brk_cls;
494
0
      }
495
0
    }
496
0
  }
497
0
  if (mark != text)
498
0
    add_flow_word(ctx, pool, flow, box, mark, text, lang);
499
0
}
500
501
static void
502
generate_text_run_with_hyphens(fz_context *ctx, fz_html_box *box, fz_html_box *flow, const char *mark, const char *end, int lang, fz_hyphenator *hyph, struct genstate *g)
503
0
{
504
0
  char word[256];
505
0
  int size = end - mark;
506
0
  if (size < 64)
507
0
  {
508
0
    fz_hyphenate_word(ctx, hyph, mark, size, word, sizeof word);
509
0
    generate_text_run(ctx, box, flow, word, word + strlen(word), lang, g);
510
0
  }
511
0
  else
512
0
  {
513
0
    generate_text_run(ctx, box, flow, mark, end, lang, g);
514
0
  }
515
0
}
516
517
static int fz_isletter_or_apos(int c)
518
0
{
519
0
  int cat;
520
0
  if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '\'' || c == 0x2019)
521
0
    return 1;
522
0
  cat = ucdn_get_general_category(c);
523
0
  return cat >= UCDN_GENERAL_CATEGORY_LL && cat <= UCDN_GENERAL_CATEGORY_LU;
524
0
}
525
526
/*
 * Convert a NUL-terminated text string into flow nodes inside the flow box
 * that encloses 'box', following the box's white-space style:
 *
 *  - with WS_FORCE_BREAK_NEWLINE, each newline ("\n", "\r" or "\r\n")
 *    becomes a FLOW_BREAK and marks the beginning of a line;
 *  - with WS_COLLAPSE, a run of whitespace is remembered in g->emit_white
 *    and emitted lazily by flush_space(); otherwise each whitespace
 *    character is emitted immediately;
 *  - everything else is split into whitespace-delimited words and handed
 *    to generate_text_run(). When automatic hyphenation is on and a
 *    hyphenator exists for 'lang', each word is first split into
 *    letter and non-letter runs and only the letter runs are hyphenated.
 */
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
{
  static const char *space = " ";

  fz_pool *pool = g->pool;
  int collapse = box->style->white_space & WS_COLLAPSE;
  int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
  int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;
  fz_hyphenator *hyph = NULL;
  fz_html_box *flow;

  flow = find_flow_encloser(ctx, box);
  if (flow == NULL)
    return;

  if (box->style->hyphens == HYP_AUTO && lang != FZ_LANG_UNSET)
  {
    hyph = fz_lookup_hyphenator(ctx, lang);
    if (hyph == NULL)
    {
      char tmp[8];
      fz_warn(ctx, "no hyphenation table for lang='%s'", fz_string_from_text_language(tmp, lang));
    }
  }

  while (*text)
  {
    const char *mark;

    /* Case 1: forced line break. */
    if (bnl && (*text == '\n' || *text == '\r'))
    {
      text += (text[0] == '\r' && text[1] == '\n') ? 2 : 1;
      add_flow_break(ctx, pool, flow, box);
      g->at_bol = 1;
      continue;
    }

    /* Case 2: whitespace. */
    if (iswhite(*text))
    {
      if (!collapse)
      {
        // TODO: tabs
        if (bsp)
          add_flow_space(ctx, pool, flow, box);
        else
          add_flow_word(ctx, pool, flow, box, space, space+1, lang);
        ++text;
      }
      else
      {
        /* Newlines must survive for case 1 when they force breaks. */
        if (bnl)
        {
          while (*text == ' ' || *text == '\t')
            ++text;
        }
        else
        {
          while (iswhite(*text))
            ++text;
        }
        g->emit_white = box;
      }
      g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
      continue;
    }

    /* Case 3: a word. */
    mark = text;

    flush_space(ctx, flow, lang, g);

    if (g->at_bol)
      g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;

    while (*text && !iswhite(*text))
      ++text;

    if (hyph == NULL)
    {
      generate_text_run(ctx, box, flow, mark, text, lang, g);
    }
    else
    {
      // split word into letter and non-letter runs for hyphenator
      const char *p = mark;
      int c, n;

      n = fz_chartorune(&c, p);
      while (p < text)
      {
        /* The first character decides which kind of run this is. */
        int letters = fz_isletter_or_apos(c);

        p += n;
        while (p < text)
        {
          n = fz_chartorune(&c, p);
          if (!fz_isletter_or_apos(c) != !letters)
            break;
          p += n;
        }

        if (letters)
          generate_text_run_with_hyphens(ctx, box, flow, mark, p, lang, hyph, g);
        else
          generate_text_run(ctx, box, flow, mark, p, lang, g);

        mark = p;
      }
    }

    g->at_bol = 0;
  }
}
640
641
/*
 * Load the image named by an element's 'src' value.
 *
 * 'src' is either a base64 data URI (jpeg, png or gif) or a path that is
 * resolved against 'base_uri', URL-decoded, normalised and read from the
 * archive 'zip'. When SVG support is compiled in and 'src' contains
 * ".svg", the data is loaded as an SVG image.
 *
 * Returns a new image reference, or NULL on failure; failures are reported
 * as a warning and never thrown.
 */
static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
{
  static const char *const data_prefixes[] = {
    "data:image/jpeg;base64,",
    "data:image/png;base64,",
    "data:image/gif;base64,",
  };
  const char *payload = NULL;
  char path[2048];
  fz_image *img = NULL;
  fz_buffer *buf = NULL;
  size_t i;

  fz_var(img);
  fz_var(buf);

  /* Recognise inline data URIs; payload points just past the prefix. */
  for (i = 0; i < sizeof data_prefixes / sizeof data_prefixes[0]; ++i)
  {
    size_t prefix_len = strlen(data_prefixes[i]);
    if (strncmp(src, data_prefixes[i], prefix_len) == 0)
    {
      payload = src + prefix_len;
      break;
    }
  }

  fz_try(ctx)
  {
    if (payload != NULL)
    {
      buf = fz_new_buffer_from_base64(ctx, payload, 0);
    }
    else
    {
      fz_strlcpy(path, base_uri, sizeof path);
      fz_strlcat(path, "/", sizeof path);
      fz_strlcat(path, src, sizeof path);
      fz_urldecode(path);
      fz_cleanname(path);
      buf = fz_read_archive_entry(ctx, zip, path);
    }
#if FZ_ENABLE_SVG
    if (strstr(src, ".svg"))
      img = fz_new_image_from_svg(ctx, buf, base_uri, zip);
    else
#endif
      img = fz_new_image_from_buffer(ctx, buf);
  }
  fz_always(ctx)
    fz_drop_buffer(ctx, buf);
  fz_catch(ctx)
  {
    fz_ignore_error(ctx);
    fz_warn(ctx, "html: cannot load image src='%s'", src);
  }

  return img;
}
684
685
static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri,
686
  fz_xml_doc *xmldoc, fz_xml *node)
687
0
{
688
0
  fz_image *img = NULL;
689
0
#if FZ_ENABLE_SVG
690
0
  fz_try(ctx)
691
0
    img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip);
692
0
  fz_catch(ctx)
693
0
  {
694
0
    fz_ignore_error(ctx);
695
0
    fz_warn(ctx, "html: cannot load embedded svg document");
696
0
  }
697
0
#endif
698
0
  return img;
699
0
}
700
701
/*
 * Add an image to the flow enclosing 'box'.
 *
 * Any pending collapsed space is flushed first. A NULL 'img' is rendered
 * as the placeholder word "[image]". Otherwise the image is added between
 * two soft breaks, and this function consumes the caller's reference to
 * 'img' (the flow node keeps its own). Errors are rethrown after the
 * reference has been dropped.
 */
static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
{
  fz_pool *pool = g->pool;
  fz_html_box *flow = find_flow_encloser(ctx, box);

  flush_space(ctx, flow, 0, g);

  if (img)
  {
    fz_try(ctx)
    {
      add_flow_sbreak(ctx, pool, flow, box);
      add_flow_image(ctx, pool, flow, box, img);
      add_flow_sbreak(ctx, pool, flow, box);
    }
    fz_always(ctx)
    {
      fz_drop_image(ctx, img);
    }
    fz_catch(ctx)
    {
      fz_rethrow(ctx);
    }
  }
  else
  {
    static const char placeholder[] = "[image]";
    add_flow_word(ctx, pool, flow, box, placeholder, placeholder + (sizeof placeholder - 1), 0);
  }

  g->at_bol = 0;
}
733
734
/*
 * Release the resources referenced by a box and everything below and
 * after it: siblings are visited iteratively, children recursively. Only
 * the images held by flow lists are dropped; the boxes themselves are not
 * freed here.
 */
static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
{
  fz_html_box *sibling;

  for (sibling = box; sibling != NULL; sibling = sibling->next)
  {
    if (sibling->type == BOX_FLOW)
      fz_drop_html_flow(ctx, sibling->u.flow.head);
    fz_drop_html_box(ctx, sibling->down);
  }
}
745
746
/*
 * Storable drop callback for fz_html: releases what the box tree
 * references, then the pool that backs the tree.
 */
static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
{
  fz_html_tree *tree = &((fz_html *)stor)->tree;

  fz_drop_html_box(ctx, tree->root);
  fz_drop_pool(ctx, tree->pool);
}
752
753
/*
 * Storable drop callback for fz_story: releases the user CSS, font set,
 * DOM, box tree references, warnings buffer and archive, and finally the
 * pool.
 */
static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor)
{
  fz_story *doomed = (fz_story *)stor;

  fz_free(ctx, doomed->user_css);
  fz_drop_html_font_set(ctx, doomed->font_set);
  fz_drop_xml(ctx, doomed->dom);
  fz_drop_html_box(ctx, doomed->tree.root);
  fz_drop_buffer(ctx, doomed->warnings);
  fz_drop_archive(ctx, doomed->zip);

  /* The pool must be the last thing dropped. */
  fz_drop_pool(ctx, doomed->tree.pool);
}
765
766
/* Drop a structure derived from an html_tree. The exact things
767
 * freed here will depend upon the drop function with which it
768
 * was created. */
769
static void
770
fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree)
771
0
{
772
0
  fz_defer_reap_start(ctx);
773
0
  fz_drop_storable(ctx, &tree->storable);
774
0
  fz_defer_reap_end(ctx);
775
0
}
776
777
/*
 * Drop a reference to an fz_html document.
 *
 * html may be NULL, in which case nothing happens. The explicit check
 * matches fz_drop_story() and avoids forming the address of a member of a
 * NULL pointer.
 */
void fz_drop_html(fz_context *ctx, fz_html *html)
{
  if (!html)
    return;

  fz_drop_html_tree(ctx, &html->tree);
}
781
782
/* Drop a reference to a story. story may be NULL, in which case nothing happens. */
void fz_drop_story(fz_context *ctx, fz_story *story)
{
  if (story)
    fz_drop_html_tree(ctx, &story->tree);
}
789
790
/*
 * Take an additional reference to an fz_html document and return the
 * value handed back by fz_keep_storable().
 */
fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
{
  fz_storable *stor = &html->tree.storable;

  return fz_keep_storable(ctx, stor);
}
794
795
/*
 * Allocate and initialise a box of the given type for an XML node.
 *
 * node:  source element, or NULL for an anonymous box (tag "#anon").
 * type:  BOX_INLINE, BOX_FLOW or one of the block-like types; it decides
 *        how much of the trailing union is allocated.
 * style: computed style, interned through fz_css_enlist().
 *
 * The tag name is shared with the static known-tag tables when possible
 * and otherwise copied into the pool. The box id comes from the "id"
 * attribute, or for <a> elements from the deprecated "name" attribute.
 *
 * Side effect: an <a> element with a link target updates g->href, and any
 * box created while g->href is set receives it as its own href.
 *
 * NOTE(review): fields not assigned here (up, down, next, and id/href when
 * absent) presumably rely on the pool returning zeroed memory - confirm.
 */
static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style)
{
  const char *tag = fz_xml_tag(node);
  const char *id = fz_xml_att(node, "id");
  fz_html_box *box;
  size_t size;

  /* Inline boxes carry no union payload; flow and block boxes do. */
  size = offsetof(fz_html_box, u);
  if (type == BOX_FLOW)
    size += sizeof(box->u.flow);
  else if (type != BOX_INLINE)
    size += sizeof(box->u.block);
  box = fz_pool_alloc(ctx, g->pool, size);

  box->type = type;
  box->is_first_flow = 0;
  box->markup_dir = g->markup_dir;
  box->heading = 0;
  box->list_item = 0;

#ifdef DEBUG_HTML_SEQ
  {
    static int seq = 0;
    box->seq = seq++;
  }
#endif

  box->style = fz_css_enlist(ctx, style, &g->styles, g->pool);

  if (!tag)
  {
    box->tag = "#anon";
  }
  else
  {
    /* Prefer a shared static name; fall back to a pool copy. */
    box->tag = find_known_html_tag(tag);
    if (!box->tag && g->is_fb2)
      box->tag = find_known_fb2_tag(tag);
    if (!box->tag)
      box->tag = fz_pool_strdup(ctx, g->pool, tag);
  }

  if (id)
    box->id = fz_pool_strdup(ctx, g->pool, id);

  if (tag && strcmp(tag, "a") == 0)
  {
    const char *href;

    // Support deprecated anchor syntax with id in "name" instead of "id" attribute.
    if (!id)
    {
      const char *name = fz_xml_att(node, "name");
      if (name)
        box->id = fz_pool_strdup(ctx, g->pool, name);
    }

    if (!g->is_fb2)
    {
      href = fz_xml_att(node, "href");
    }
    else
    {
      href = fz_xml_att(node, "l:href");
      if (!href)
        href = fz_xml_att(node, "xlink:href");
    }
    if (href)
      g->href = fz_pool_strdup(ctx, g->pool, href);
  }

  if (g->href)
    box->href = g->href;

  if (type == BOX_FLOW)
  {
    /* Empty flow list: the tail pointer starts at the head slot. */
    box->u.flow.head = NULL;
    box->s.build.flow_tail = &box->u.flow.head;
  }

  return box;
}
875
876
/*
 * Make 'child' the last child of 'parent': set its parent pointer, link it
 * after the current last child (or as the first child if there is none)
 * and update the parent's build-time last_child pointer.
 */
static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child)
{
  fz_html_box *previous;

  child->up = parent;

  if (parent->down == NULL)
    parent->down = child;

  previous = parent->s.build.last_child;
  if (previous != NULL)
    previous->next = child;

  parent->s.build.last_child = child;
}
885
886
/*
 * Return 'box' itself or its nearest ancestor that is a BOX_BLOCK or
 * BOX_TABLE_CELL.
 * NOTE(review): there is no NULL check on the way up, so the caller must
 * guarantee that such an ancestor exists.
 */
static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box)
{
  for (;;)
  {
    if (box->type == BOX_BLOCK || box->type == BOX_TABLE_CELL)
      return box;
    box = box->up;
  }
}
892
893
/*
 * Find the box a table row should be attached to: 'box' itself or its
 * nearest ancestor of type BOX_TABLE. Returns NULL, after a warning, if
 * there is none.
 */
static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box)
{
  fz_html_box *candidate;

  for (candidate = box; candidate != NULL; candidate = candidate->up)
    if (candidate->type == BOX_TABLE)
      return candidate;

  fz_warn(ctx, "table-row not inside table element");
  return NULL;
}
903
904
/*
 * Find the box a table cell should be attached to: 'box' itself or its
 * nearest ancestor of type BOX_TABLE_ROW. Returns NULL, after a warning,
 * if there is none.
 */
static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box)
{
  fz_html_box *candidate;

  for (candidate = box; candidate != NULL; candidate = candidate->up)
    if (candidate->type == BOX_TABLE_ROW)
      return candidate;

  fz_warn(ctx, "table-cell not inside table-row element");
  return NULL;
}
914
915
/*
	Return the box that inline content should be appended to.

	If box is already a flow or inline box it is returned unchanged.
	Otherwise the closest block/table-cell ancestor is located; if its
	most recently appended child is a flow box that one is reused,
	else a new flow box with the default style is created, appended
	to the block, and returned. Creating a new flow box also sets
	g->at_bol.
*/
static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box)
{
	fz_css_style default_style;
	fz_html_box *block;
	fz_html_box *last;
	fz_html_box *flow;

	/* Already inside a flow/inline context: nothing to do. */
	if (box->type == BOX_INLINE || box->type == BOX_FLOW)
		return box;

	/* Climb to the closest block level box that can hold a flow. */
	block = box;
	while (!(block->type == BOX_BLOCK || block->type == BOX_TABLE_CELL))
		block = block->up;

	/* Keep appending to the flow box that is still open, if any. */
	last = block->s.build.last_child;
	if (last && last->type == BOX_FLOW)
		return last;

	/* Otherwise open a fresh flow box in this block. */
	// TODO: null style instead of default for flow box?
	fz_default_css_style(ctx, &default_style);
	flow = new_box(ctx, g, NULL, BOX_FLOW, &default_style);
	flow->is_first_flow = !block->down;
	g->at_bol = 1;

	append_box(ctx, block, flow);

	return flow;
}
945
946
static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match);
947
948
static void
949
apply_attributes_as_styles(fz_context *ctx, fz_css_style *style, fz_xml *node)
950
0
{
951
0
  const char *att;
952
0
  const char *tag = fz_xml_tag(node);
953
954
0
  if (tag == NULL)
955
0
    return; /* No tag -> no attributes. */
956
957
0
  if (!strcmp(tag, "canvas") ||
958
0
    !strcmp(tag, "embed") ||
959
0
    !strcmp(tag, "iframe") ||
960
0
    !strcmp(tag, "img") ||
961
0
    !strcmp(tag, "input") ||
962
0
    !strcmp(tag, "object") ||
963
0
    !strcmp(tag, "video"))
964
0
  {
965
0
    att = fz_xml_att(node, "width");
966
0
    if (att)
967
0
    {
968
0
      style->width.value = fz_atof(att);
969
0
      if (strchr(att,'%'))
970
0
        style->width.unit = N_PERCENT;
971
0
      else
972
0
        style->width.unit = N_LENGTH;
973
0
    }
974
975
0
    att = fz_xml_att(node, "height");
976
0
    if (att)
977
0
    {
978
0
      style->height.value = fz_atof(att);
979
0
      if (strchr(att,'%'))
980
0
        style->height.unit = N_PERCENT;
981
0
      else
982
0
        style->height.unit = N_LENGTH;
983
0
    }
984
0
  }
985
986
0
  att = fz_xml_att(node, "valign");
987
0
  if (!att)
988
0
  {}
989
0
  else if (!strcmp(att, "top"))
990
0
    style->vertical_align = VA_TOP;
991
0
  else if (!strcmp(att, "middle"))
992
0
    style->vertical_align = VA_MIDDLE;
993
0
  else if (!strcmp(att, "bottom"))
994
0
    style->vertical_align = VA_BOTTOM;
995
0
  else if (!strcmp(att, "baseline"))
996
0
    style->vertical_align = VA_BASELINE;
997
998
0
  if (!strcmp(tag, "td") ||
999
0
    !strcmp(tag, "th"))
1000
0
  {
1001
0
    att = fz_xml_att(node, "rowspan");
1002
0
    if (att)
1003
0
    {
1004
0
      int i = fz_atoi(att);
1005
0
      style->rowspan = fz_clampi(i, 1, 1000);
1006
0
    }
1007
1008
0
    att = fz_xml_att(node, "colspan");
1009
0
    if (att)
1010
0
    {
1011
0
      int i = fz_atoi(att);
1012
0
      style->colspan = fz_clampi(i, 1, 1000);
1013
0
    }
1014
0
  }
1015
1016
  /* FIXME: We probably need to vary this based on node type;
1017
   * for images, it'd need to be "float:left" etc. */
1018
0
  att = fz_xml_att(node, "align");
1019
0
  if (!att)
1020
0
  {}
1021
0
  else if (!strcmp(att, "left"))
1022
0
    style->text_align = TA_LEFT;
1023
0
  else if (!strcmp(att, "right"))
1024
0
    style->text_align = TA_RIGHT;
1025
0
  else if (!strcmp(att, "center"))
1026
0
    style->text_align = TA_CENTER;
1027
0
  else if (!strcmp(att, "justify"))
1028
0
    style->text_align = TA_JUSTIFY;
1029
1030
0
  att = fz_xml_att(node, "bgcolor");
1031
0
  if (att)
1032
0
    style->background_color = fz_css_color_from_string(att);
1033
1034
0
  att = fz_xml_att(node, "border");
1035
0
  if (att)
1036
0
  {
1037
0
    style->border_width[3].unit = style->border_width[2].unit = style->border_width[1].unit = style->border_width[0].unit = N_LENGTH;
1038
0
    style->border_width[3].value = style->border_width[2].value = style->border_width[1].value = style->border_width[0].value = fz_atof(att);
1039
0
  }
1040
1041
0
  att = fz_xml_att(node, "hidden");
1042
0
  if(att)
1043
0
    style->visibility = V_HIDDEN;
1044
0
}
1045
1046
/*
	Generate flow content for a text node.

	If white space collapses in root_box and the text is entirely
	white space, no content is generated; the box is only recorded in
	g->emit_white. Otherwise the text is handed to generate_text,
	wrapped in an anonymous inline box when root_box is not itself an
	inline box.
*/
static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
{
	const char *text = fz_xml_text(node);
	fz_html_box *target = root_box;

	if ((root_box->style->white_space & WS_COLLAPSE) && is_all_white(text))
	{
		g->emit_white = root_box;
		return;
	}

	if (root_box->type != BOX_INLINE)
	{
		/* Anonymous inline box, with the same style as the containing box. */
		fz_css_style anon_style = *root_box->style;
		fz_html_box *parent;

		// Make sure not to recursively multiply font sizes
		anon_style.font_size.value = 1;
		anon_style.font_size.unit = N_SCALE;

		parent = find_inline_context(ctx, g, root_box);
		target = new_box(ctx, g, NULL, BOX_INLINE, &anon_style);
		append_box(ctx, parent, target);
	}

	generate_text(ctx, target, text, g->markup_lang, g);
}
1079
1080
/*
	Create an inline box for node and append it to the inline context
	of root_box (creating a flow box if required).

	If the new box has an id, an anchor is added to the enclosing flow
	so the id can be located. Returns the new inline box.
*/
static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
{
	fz_html_box *parent = find_inline_context(ctx, g, root_box);
	fz_html_box *inline_box = new_box(ctx, g, node, BOX_INLINE, style);

	append_box(ctx, parent, inline_box);

	if (inline_box->id)
	{
		fz_html_box *flow = find_flow_encloser(ctx, inline_box);
		add_flow_anchor(ctx, g->pool, flow, inline_box);
	}

	return inline_box;
}
1094
1095
/*
	Handle a <br> element: add a break to the enclosing flow and mark
	the generator as being at the beginning of a line.

	When root_box is not an inline box, a new inline box is created to
	own the break; otherwise root_box itself owns it.
*/
static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node)
{
	fz_html_box *br_box = root_box;
	fz_html_box *flow;

	if (root_box->type != BOX_INLINE)
	{
		/* Inline box to hold the <br> tag, styled like the containing block. */
		fz_css_style br_style = *root_box->style;
		fz_html_box *parent;

		/* Make sure not to recursively multiply font sizes. */
		br_style.font_size.value = 1;
		br_style.font_size.unit = N_SCALE;

		/* The box is created before the inline context is looked up. */
		br_box = new_box(ctx, g, node, BOX_INLINE, &br_style);
		parent = find_inline_context(ctx, g, root_box);
		append_box(ctx, parent, br_box);
	}

	flow = find_flow_encloser(ctx, br_box);
	add_flow_break(ctx, g->pool, flow, br_box);
	g->at_bol = 1;
}
1119
1120
/*
	Create a block box for node and append it to the nearest
	block/table-cell context at or above root_box.
	Returns the new block box.
*/
static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
{
	fz_html_box *parent = find_block_context(ctx, root_box);
	fz_html_box *block = new_box(ctx, g, node, BOX_BLOCK, style);

	append_box(ctx, parent, block);
	return block;
}
1128
1129
static void
1130
push_colstyle(fz_context *ctx, table_styles *ts, col_style cs)
1131
0
{
1132
0
  if (ts->ncols == ts->maxcols)
1133
0
  {
1134
0
    int newmax = ts->maxcols * 2;
1135
0
    if (newmax == 0)
1136
0
      newmax = 8;
1137
0
    ts->styles = fz_realloc(ctx, ts->styles, sizeof(ts->styles[0]) * newmax);
1138
0
    ts->maxcols = newmax;
1139
0
  }
1140
1141
0
  ts->styles[ts->ncols++] = cs;
1142
0
}
1143
1144
/*
	Handle a <col> element: record its column style once for every
	column it spans.

	The "span" attribute defaults to 1 and is clamped to the range
	1..1000. The upper bound stops a hostile document (for example
	span="2000000000") from forcing billions of allocations, and uses
	the same limit that this file applies to rowspan/colspan.
*/
static void gen2_col(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_match *match)
{
	const char *span = fz_xml_att(node, "span");
	col_style cs = { 0 };
	int i;
	int n = span ? fz_atoi(span) : 1;

	if (n < 1)
		n = 1;
	if (n > 1000)
		n = 1000;

	/* Get the col styles. */
	fz_css_colstyle(&cs, match);

	/* FIXME: width attr ? */

	for (i = 0; i < n; i++)
		push_colstyle(ctx, &g->tab_styles, cs);
}
1161
1162
/* All this does is give us a warning if we fail to be in a table. */
1163
static void gen2_colgroup(fz_context *ctx, fz_html_box *root_box)
1164
0
{
1165
0
  fz_html_box *look = root_box;
1166
0
  while (look && look->type != BOX_TABLE)
1167
0
    look = look->up;
1168
0
  if (!look)
1169
0
    fz_warn(ctx, "colgroup not inside table element");
1170
0
}
1171
1172
/*
	Create a table box for node and append it to the nearest
	block/table-cell context at or above root_box.
	Returns the new table box.
*/
static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
{
	fz_html_box *parent = find_block_context(ctx, root_box);
	fz_html_box *table = new_box(ctx, g, node, BOX_TABLE, style);

	append_box(ctx, parent, table);
	return table;
}
1180
1181
/*
	Create a table-row box for node inside the enclosing table and
	reset the running column counter.

	If there is no enclosing table the element is generated as a plain
	block instead. Returns the box that was created.
*/
static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style)
{
	fz_html_box *table = find_table_row_context(ctx, root_box);
	fz_html_box *row;

	if (table == NULL)
		return gen2_block(ctx, g, root_box, node, style);

	row = new_box(ctx, g, node, BOX_TABLE_ROW, style);
	append_box(ctx, table, row);

	/* Cells of this row are numbered from column zero. */
	g->col_num = 0;

	return row;
}
1194
1195
/*
	Create a table-cell box for node inside the enclosing table row.

	If there is no enclosing row the element is generated as a plain
	block instead. Otherwise the CSS match for the node is computed
	and applied to style, any properties recorded for the current
	column (g->col_num) are overlaid on a local copy, attribute based
	styles are applied on top, and the column counter is advanced.
	Returns the box that was created.
*/
static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style, fz_css_match *root_match)
{
	fz_html_box *row = find_table_cell_context(ctx, root_box);
	fz_html_box *cell;
	fz_css_style merged;
	fz_css_match match;
	int side;

	if (row == NULL)
		return gen2_block(ctx, g, root_box, node, style);

	fz_match_css(ctx, &match, root_match, g->css, node);
	fz_apply_css_style(ctx, g->set, style, &match);

	if (g->col_num < g->tab_styles.ncols)
	{
		/* Work on a local copy of the style, and overlay anything set on the col. */
		col_style *col = &g->tab_styles.styles[g->col_num];

		merged = *style;
		style = &merged;

		if (col->has_bg_col)
			style->background_color = col->background_color;

		/* Bit N of the has_border_* masks flags side N. */
		for (side = 0; side < 4; side++)
		{
			if (col->has_border_col & (1 << side))
				style->border_color[side] = col->border_color[side];
		}
		for (side = 0; side < 4; side++)
		{
			if (col->has_border_width & (1 << side))
				style->border_width[side] = col->border_width[side];
		}

		if (col->has_visibility)
			style->visibility = col->visibility;
		if (col->has_width)
			style->width = col->width;
	}

	apply_attributes_as_styles(ctx, style, node);
	g->col_num++;

	cell = new_box(ctx, g, node, BOX_TABLE_CELL, style);
	append_box(ctx, row, cell);
	return cell;
}
1243
1244
/*
	Insert an image into the box tree.

	For inline and inline-block display the image gets an inline box
	in the current inline context. For any other display value a block
	box is created for node first, and the image goes into an anonymous
	inline box inside that block. In both cases img is passed on to
	generate_image.
*/
static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style)
{
	fz_html_box *inline_parent;
	fz_html_box *image_box;

	if (display == DIS_INLINE || display == DIS_INLINE_BLOCK)
	{
		inline_parent = find_inline_context(ctx, g, root_box);
		image_box = new_box(ctx, g, node, BOX_INLINE, style);
	}
	else
	{
		fz_html_box *block_parent = find_block_context(ctx, root_box);
		fz_html_box *wrapper = new_box(ctx, g, node, BOX_BLOCK, style);

		append_box(ctx, block_parent, wrapper);

		/* The inline box inside the wrapper is anonymous (no node). */
		inline_parent = find_inline_context(ctx, g, wrapper);
		image_box = new_box(ctx, g, NULL, BOX_INLINE, style);
	}

	append_box(ctx, inline_parent, image_box);
	generate_image(ctx, image_box, img, g);
}
1268
1269
/*
	Handle an HTML <img> element.

	Nothing is generated when there is no "src" attribute. Positive
	integer "width"/"height" attributes override the corresponding
	style values on a local copy of style (as a percentage if the
	attribute contains '%', else as a length). The image is then
	loaded and inserted via gen2_image_common.
*/
static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
{
	const char *src = fz_xml_att(node, "src");
	const char *w_att, *h_att;
	fz_css_style local_style;
	fz_image *img;
	int w, h;

	if (!src)
		return;

	local_style = *style;
	w_att = fz_xml_att(node, "width");
	h_att = fz_xml_att(node, "height");

	if (w_att)
	{
		w = fz_atoi(w_att);
		if (w > 0)
		{
			local_style.width.value = w;
			local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
		}
	}

	if (h_att)
	{
		h = fz_atoi(h_att);
		if (h > 0)
		{
			local_style.height.value = h;
			local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
		}
	}

	img = load_html_image(ctx, g->zip, g->base_uri, src);
	gen2_image_common(ctx, g, root_box, node, img, display, &local_style);
}
1295
1296
/*
	Handle a FictionBook <image> element.

	The reference is read from "l:href", falling back to "xlink:href".
	Only references of the form "#id" are handled: the id is looked up
	in g->images and a new reference to the result is passed to
	gen2_image_common. Anything else generates nothing.
*/
static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
{
	const char *src = fz_xml_att(node, "l:href");
	fz_image *img;

	if (!src)
		src = fz_xml_att(node, "xlink:href");

	if (!src || src[0] != '#')
		return;

	/* Skip the leading '#' to get the key used in the image tree. */
	img = fz_tree_lookup(ctx, g->images, src+1);
	gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style);
}
1307
1308
/*
	Handle an inline <svg> element: load it as an image via
	load_svg_image and insert it through gen2_image_common.
*/
static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style)
{
	fz_image *svg = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node);

	gen2_image_common(ctx, g, root_box, node, svg, display, style);
}
1313
1314
/*
	Map a tag name to a heading level.

	"h1".."h6" give 1..6. In FictionBook documents "title" and
	"subtitle" give the current section depth, capped at 6.
	Every other tag gives 0. tag must not be NULL.
*/
static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag)
{
	/* Exactly two characters: 'h' followed by a digit 1-6. */
	if (tag[0] == 'h' && tag[1] >= '1' && tag[1] <= '6' && tag[2] == 0)
		return tag[1] - '0';

	if (g->is_fb2 && (!strcmp(tag, "title") || !strcmp(tag, "subtitle")))
		return fz_mini(g->section_depth, 6);

	return 0;
}
1335
1336
static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node,
1337
  fz_css_match *match, int display, fz_css_style *style)
1338
0
{
1339
0
  fz_html_box *this_box = NULL;
1340
0
  const char *tag;
1341
0
  const char *lang_att;
1342
0
  const char *dir_att;
1343
1344
0
  int save_markup_dir = g->markup_dir;
1345
0
  int save_markup_lang = g->markup_lang;
1346
0
  char *save_href = g->href;
1347
1348
0
  if (display == DIS_NONE)
1349
0
    return;
1350
1351
0
  tag = fz_xml_tag(node);
1352
1353
0
  if (style->direction == FZ_BIDI_UNSET)
1354
0
  {
1355
0
    dir_att = fz_xml_att(node, "dir");
1356
0
    if (dir_att)
1357
0
    {
1358
0
      if (!strcmp(dir_att, "auto"))
1359
0
        g->markup_dir = FZ_BIDI_NEUTRAL;
1360
0
      else if (!strcmp(dir_att, "rtl"))
1361
0
        g->markup_dir = FZ_BIDI_RTL;
1362
0
      else if (!strcmp(dir_att, "ltr"))
1363
0
        g->markup_dir = FZ_BIDI_LTR;
1364
0
      else
1365
0
        g->markup_dir = FZ_BIDI_LTR;
1366
0
    }
1367
0
  }
1368
0
  else
1369
0
  {
1370
0
    g->markup_dir = style->direction;
1371
0
  }
1372
1373
0
  lang_att = fz_xml_att(node, "lang");
1374
0
  if (lang_att)
1375
0
    g->markup_lang = fz_text_language_from_string(lang_att);
1376
1377
0
  switch (display)
1378
0
  {
1379
0
  case DIS_INLINE_BLOCK:
1380
    // TODO handle inline block as a flow node
1381
0
    this_box = gen2_block(ctx, g, root_box, node, style);
1382
0
    break;
1383
1384
0
  case DIS_BLOCK:
1385
0
    this_box = gen2_block(ctx, g, root_box, node, style);
1386
0
    this_box->heading = get_heading_from_tag(ctx, g, tag);
1387
0
    break;
1388
1389
0
  case DIS_LIST_ITEM:
1390
0
    this_box = gen2_block(ctx, g, root_box, node, style);
1391
0
    this_box->list_item = ++g->list_counter;
1392
0
    break;
1393
1394
  // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes
1395
  //
1396
  // The table generation code should insert and create anonymous boxes
1397
  // for any missing child/parent elements.
1398
  //
1399
  // MISSING CHILDREN:
1400
  // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW.
1401
  // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL.
1402
  //
1403
  // MISSING PARENTS:
1404
  // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW
1405
  // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE
1406
  //
1407
  // For now we ignore this and treat any such elements that are out of
1408
  // context as plain block elements.
1409
1410
0
  case DIS_TABLE:
1411
0
    this_box = gen2_table(ctx, g, root_box, node, style);
1412
0
    break;
1413
0
  case DIS_TABLE_GROUP:
1414
    // no box for table-row-group elements
1415
0
    this_box = root_box;
1416
0
    break;
1417
0
  case DIS_TABLE_ROW:
1418
0
    this_box = gen2_table_row(ctx, g, root_box, node, style);
1419
0
    break;
1420
0
  case DIS_TABLE_CELL:
1421
0
    this_box = gen2_table_cell(ctx, g, root_box, node, style, match);
1422
0
    break;
1423
1424
0
  case DIS_TABLE_COLGROUP:
1425
0
    gen2_colgroup(ctx, root_box);
1426
    // no box for colgroup elements.
1427
0
    this_box = root_box;
1428
0
    break;
1429
1430
0
  case DIS_INLINE:
1431
0
  default:
1432
0
    this_box = gen2_inline(ctx, g, root_box, node, style);
1433
0
    break;
1434
0
  }
1435
1436
0
  if (this_box == NULL)
1437
0
    goto end;
1438
1439
0
  if (tag && (!strcmp(tag, "ol") || !strcmp(tag, "ul") || !strcmp(tag, "dl")))
1440
0
  {
1441
0
    int save_list_counter = g->list_counter;
1442
0
    g->list_counter = 0;
1443
0
    gen2_children(ctx, g, this_box, node, match);
1444
0
    g->list_counter = save_list_counter;
1445
0
  }
1446
0
  else if (tag && !strcmp(tag, "section"))
1447
0
  {
1448
0
    int save_section_depth = g->section_depth;
1449
0
    g->section_depth++;
1450
0
    gen2_children(ctx, g, this_box, node, match);
1451
0
    g->section_depth = save_section_depth;
1452
0
  }
1453
0
  else if (display == DIS_TABLE)
1454
0
  {
1455
0
    table_styles saved_styles = g->tab_styles;
1456
0
    int saved_col_num = g->col_num;
1457
0
    fz_try(ctx)
1458
0
    {
1459
0
      g->tab_styles.maxcols = 0;
1460
0
      g->tab_styles.ncols = 0;
1461
0
      g->tab_styles.styles = NULL;
1462
0
      gen2_children(ctx, g, this_box, node, match);
1463
0
    }
1464
0
    fz_always(ctx)
1465
0
    {
1466
0
      drop_table_styles(ctx, &g->tab_styles);
1467
0
      g->tab_styles = saved_styles;
1468
0
      g->col_num = saved_col_num;
1469
0
    }
1470
0
    fz_catch(ctx)
1471
0
      fz_rethrow(ctx);
1472
0
  }
1473
0
  else
1474
0
  {
1475
0
    gen2_children(ctx, g, this_box, node, match);
1476
0
  }
1477
1478
0
end:
1479
0
  g->markup_dir = save_markup_dir;
1480
0
  g->markup_lang = save_markup_lang;
1481
0
  g->href = save_href;
1482
0
}
1483
1484
/*
	Generate boxes for every child of root_node, in document order.

	Text nodes go to gen2_text. For element nodes the CSS match and
	style are computed (including attribute based styles) and the
	element is dispatched by tag name: br, img, image (FictionBook
	only), svg and col have dedicated handlers; everything else goes
	through gen2_tag.
*/
static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match)
{
	fz_css_match match;
	fz_css_style style;
	fz_xml *child;

	for (child = fz_xml_down(root_node); child; child = fz_xml_next(child))
	{
		const char *tag = fz_xml_tag(child);
		int display;

		if (!tag)
		{
			gen2_text(ctx, g, root_box, child);
			continue;
		}

		fz_match_css(ctx, &match, root_match, g->css, child);
		fz_apply_css_style(ctx, g->set, &style, &match);
		apply_attributes_as_styles(ctx, &style, child);
		display = fz_get_css_match_display(&match);

		if (!strcmp(tag, "br"))
			gen2_break(ctx, g, root_box, child);
		else if (!strcmp(tag, "img"))
			gen2_image_html(ctx, g, root_box, child, display, &style);
		else if (g->is_fb2 && !strcmp(tag, "image"))
			gen2_image_fb2(ctx, g, root_box, child, display, &style);
		else if (!strcmp(tag, "svg"))
			gen2_image_svg(ctx, g, root_box, child, display, &style);
		else if (!strcmp(tag, "col"))
			gen2_col(ctx, g, root_box, child, &match);
		else
			gen2_tag(ctx, g, root_box, child, &match, display, &style);
	}
}
1532
1533
static void
1534
html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
1535
0
{
1536
0
  char path[2048];
1537
0
  char css_base_uri[2048];
1538
0
  fz_buffer *buf;
1539
1540
0
  fz_var(buf);
1541
1542
0
  fz_strlcpy(path, base_uri, sizeof path);
1543
0
  fz_strlcat(path, "/", sizeof path);
1544
0
  fz_strlcat(path, href, sizeof path);
1545
0
  fz_urldecode(path);
1546
0
  fz_cleanname(path);
1547
1548
0
  fz_dirname(css_base_uri, path, sizeof css_base_uri);
1549
1550
0
  buf = NULL;
1551
0
  fz_try(ctx)
1552
0
  {
1553
0
    buf = fz_read_archive_entry(ctx, zip, path);
1554
0
    fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path);
1555
0
    fz_add_css_font_faces(ctx, set, zip, css_base_uri, css);
1556
0
  }
1557
0
  fz_always(ctx)
1558
0
    fz_drop_buffer(ctx, buf);
1559
0
  fz_catch(ctx)
1560
0
  {
1561
0
    fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1562
0
    fz_report_error(ctx);
1563
0
    fz_warn(ctx, "ignoring stylesheet %s", path);
1564
0
  }
1565
0
}
1566
1567
static void
1568
html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1569
0
{
1570
0
  fz_xml *html, *head, *node;
1571
1572
0
  html = fz_xml_find(root, "html");
1573
0
  head = fz_xml_find_down(html, "head");
1574
0
  for (node = fz_xml_down(head); node; node = fz_xml_next(node))
1575
0
  {
1576
0
    if (fz_xml_is_tag(node, "link"))
1577
0
    {
1578
0
      char *rel = fz_xml_att(node, "rel");
1579
0
      if (rel && !fz_strcasecmp(rel, "stylesheet"))
1580
0
      {
1581
0
        char *type = fz_xml_att(node, "type");
1582
0
        if ((type && !strcmp(type, "text/css")) || !type)
1583
0
        {
1584
0
          char *href = fz_xml_att(node, "href");
1585
0
          if (href)
1586
0
          {
1587
0
            html_load_css_link(ctx, set, zip, base_uri, css, root, href);
1588
0
          }
1589
0
        }
1590
0
      }
1591
0
    }
1592
0
    else if (fz_xml_is_tag(node, "style"))
1593
0
    {
1594
0
      char *s = fz_new_text_from_xml(ctx, node);
1595
0
      fz_try(ctx)
1596
0
      {
1597
0
        fz_parse_css(ctx, css, s, "<style>");
1598
0
        fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1599
0
      }
1600
0
      fz_always(ctx)
1601
0
        fz_free(ctx, s);
1602
0
      fz_catch(ctx)
1603
0
      {
1604
0
        fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1605
0
        fz_report_error(ctx);
1606
0
        fz_warn(ctx, "ignoring inline stylesheet");
1607
0
      }
1608
0
    }
1609
0
  }
1610
0
}
1611
1612
static void
1613
fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1614
0
{
1615
0
  fz_xml *fictionbook, *stylesheet;
1616
1617
0
  fictionbook = fz_xml_find(root, "FictionBook");
1618
0
  stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
1619
0
  if (stylesheet)
1620
0
  {
1621
0
    char *s = fz_new_text_from_xml(ctx, stylesheet);
1622
0
    fz_try(ctx)
1623
0
    {
1624
0
      fz_parse_css(ctx, css, s, "<stylesheet>");
1625
0
      fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1626
0
    }
1627
0
    fz_catch(ctx)
1628
0
    {
1629
0
      fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1630
0
      fz_report_error(ctx);
1631
0
      fz_warn(ctx, "ignoring inline stylesheet");
1632
0
    }
1633
0
    fz_free(ctx, s);
1634
0
  }
1635
0
}
1636
1637
static fz_tree *
1638
load_fb2_images(fz_context *ctx, fz_xml *root)
1639
0
{
1640
0
  fz_xml *fictionbook, *binary;
1641
0
  fz_tree *images = NULL;
1642
1643
0
  fictionbook = fz_xml_find(root, "FictionBook");
1644
0
  for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
1645
0
  {
1646
0
    const char *id = fz_xml_att(binary, "id");
1647
0
    char *b64 = NULL;
1648
0
    fz_buffer *buf = NULL;
1649
0
    fz_image *img = NULL;
1650
1651
0
    fz_var(b64);
1652
0
    fz_var(buf);
1653
1654
0
    if (id == NULL)
1655
0
    {
1656
0
      fz_warn(ctx, "Skipping image with no id");
1657
0
      continue;
1658
0
    }
1659
1660
0
    fz_try(ctx)
1661
0
    {
1662
0
      b64 = fz_new_text_from_xml(ctx, binary);
1663
0
      buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
1664
0
      img = fz_new_image_from_buffer(ctx, buf);
1665
0
    }
1666
0
    fz_always(ctx)
1667
0
    {
1668
0
      fz_drop_buffer(ctx, buf);
1669
0
      fz_free(ctx, b64);
1670
0
    }
1671
0
    fz_catch(ctx)
1672
0
      fz_rethrow(ctx);
1673
1674
0
    images = fz_tree_insert(ctx, images, id, img);
1675
0
  }
1676
1677
0
  return images;
1678
0
}
1679
1680
/* Growable scratch buffer of code points used during bidi detection. */
typedef struct
{
	uint32_t *data;	/* Storage, (re)allocated on demand. */
	size_t cap;	/* Number of entries allocated in data. */
	size_t len;	/* Number of entries currently in use. */
} uni_buf;
1686
1687
typedef struct
1688
{
1689
  fz_context *ctx;
1690
  fz_pool *pool;
1691
  fz_html_flow *flow;
1692
  uni_buf *buffer;
1693
} bidi_data;
1694
1695
static void fragment_cb(const uint32_t *fragment,
1696
      size_t fragment_len,
1697
      int bidi_level,
1698
      int script,
1699
      void *arg)
1700
0
{
1701
0
  bidi_data *data = (bidi_data *)arg;
1702
1703
  /* We are guaranteed that fragmentOffset will be at the beginning
1704
   * of flow. */
1705
0
  while (fragment_len > 0)
1706
0
  {
1707
0
    size_t len;
1708
1709
0
    if (data->flow->type == FLOW_SPACE)
1710
0
    {
1711
0
      len = 1;
1712
0
    }
1713
0
    else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
1714
0
        data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
1715
0
    {
1716
0
      len = 0;
1717
0
    }
1718
0
    else
1719
0
    {
1720
      /* Must be text */
1721
0
      len = fz_utflen(data->flow->content.text);
1722
0
      if (len > fragment_len)
1723
0
      {
1724
        /* We need to split this flow box */
1725
0
        (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len);
1726
0
        len = fz_utflen(data->flow->content.text);
1727
0
      }
1728
0
    }
1729
1730
    /* This flow box is entirely contained within this fragment. */
1731
0
    data->flow->bidi_level = bidi_level;
1732
0
    data->flow->script = script;
1733
0
    data->flow = data->flow->next;
1734
0
    fragment_len -= len;
1735
0
  }
1736
0
}
1737
1738
static fz_bidi_direction
1739
detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
1740
0
{
1741
0
  fz_html_flow *end = flow;
1742
0
  bidi_data data;
1743
1744
0
  while (end)
1745
0
  {
1746
0
    unsigned int level = end->bidi_level;
1747
1748
    /* Gather the text from the flow up into a single buffer (at
1749
     * least, as much of it as has the same direction markup). */
1750
0
    buffer->len = 0;
1751
0
    while (end && (level & 1) == (end->bidi_level & 1))
1752
0
    {
1753
0
      size_t len = 0;
1754
0
      const char *text = "";
1755
0
      int broken = 0;
1756
1757
0
      switch (end->type)
1758
0
      {
1759
0
      case FLOW_WORD:
1760
0
        len = fz_utflen(end->content.text);
1761
0
        text = end->content.text;
1762
0
        break;
1763
0
      case FLOW_SPACE:
1764
0
        len = 1;
1765
0
        text = " ";
1766
0
        break;
1767
0
      case FLOW_SHYPHEN:
1768
0
      case FLOW_SBREAK:
1769
0
        break;
1770
0
      case FLOW_BREAK:
1771
0
      case FLOW_IMAGE:
1772
0
        broken = 1;
1773
0
        break;
1774
0
      }
1775
1776
0
      end = end->next;
1777
1778
0
      if (broken)
1779
0
        break;
1780
1781
      /* Make sure the buffer is large enough */
1782
0
      if (buffer->len + len > buffer->cap)
1783
0
      {
1784
0
        size_t newcap = buffer->cap;
1785
0
        if (newcap < 128)
1786
0
          newcap = 128; /* Sensible small default */
1787
1788
0
        while (newcap < buffer->len + len)
1789
0
          newcap = (newcap * 3) / 2;
1790
1791
0
        buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
1792
0
        buffer->cap = newcap;
1793
0
      }
1794
1795
      /* Expand the utf8 text into Unicode and store it in the buffer */
1796
0
      while (*text)
1797
0
      {
1798
0
        int rune;
1799
0
        text += fz_chartorune(&rune, text);
1800
0
        buffer->data[buffer->len++] = rune;
1801
0
      }
1802
0
    }
1803
1804
    /* Detect directionality for the buffer */
1805
0
    data.ctx = ctx;
1806
0
    data.pool = pool;
1807
0
    data.flow = flow;
1808
0
    data.buffer = buffer;
1809
0
    fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
1810
0
    flow = end;
1811
0
  }
1812
0
  return bidi_dir;
1813
0
}
1814
1815
static void
1816
detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
1817
0
{
1818
0
  while (box)
1819
0
  {
1820
0
    if (box->type == BOX_FLOW)
1821
0
      box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head);
1822
0
    detect_box_directionality(ctx, pool, buffer, box->down);
1823
0
    box = box->next;
1824
0
  }
1825
0
}
1826
1827
static void
1828
detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
1829
0
{
1830
0
  uni_buf buffer = { NULL };
1831
1832
0
  fz_try(ctx)
1833
0
    detect_box_directionality(ctx, pool, &buffer, box);
1834
0
  fz_always(ctx)
1835
0
    fz_free(ctx, buffer.data);
1836
0
  fz_catch(ctx)
1837
0
    fz_rethrow(ctx);
1838
0
}
1839
1840
static fz_xml_doc *
1841
parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5)
1842
0
{
1843
0
  fz_xml_doc *xml;
1844
1845
0
  if (try_xml && try_html5)
1846
0
  {
1847
0
    fz_try(ctx)
1848
0
      xml = fz_parse_xml(ctx, buf, 1);
1849
0
    fz_catch(ctx)
1850
0
    {
1851
0
      if (fz_caught(ctx) == FZ_ERROR_SYNTAX)
1852
0
      {
1853
0
        fz_report_error(ctx);
1854
0
        fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser");
1855
0
        xml = fz_parse_xml_from_html5(ctx, buf);
1856
0
      }
1857
0
      else
1858
0
        fz_rethrow(ctx);
1859
0
    }
1860
0
  }
1861
0
  else if (try_xml)
1862
0
    xml = fz_parse_xml(ctx, buf, 1);
1863
0
  else
1864
0
  {
1865
0
    assert(try_html5);
1866
0
    xml = fz_parse_xml_from_html5(ctx, buf);
1867
0
  }
1868
1869
0
  return xml;
1870
0
}
1871
1872
/*
 * Transfer the background colour of 'from' onto 'root': root receives a
 * copy of its style with from's background colour, and from receives a
 * copy of its style with a fully transparent background. Both modified
 * styles are registered through fz_css_enlist.
 */
static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from)
{
  fz_css_color none = { 0, 0, 0, 0 };
  fz_css_style root_style;
  fz_css_style from_style;

  /* Byte-wise copies, exactly as before (not struct assignment). */
  memcpy(&from_style, from->style, sizeof from_style);
  memcpy(&root_style, root->style, sizeof root_style);

  root_style.background_color = from_style.background_color;
  from_style.background_color = none;

  root->style = fz_css_enlist(ctx, &root_style, &g->styles, g->pool);
  from->style = fz_css_enlist(ctx, &from_style, &g->styles, g->pool);
}
1883
1884
/*
 * If the root box has no background colour of its own (alpha == 0),
 * adopt the one from the <html> box, or failing that from the <body>
 * box directly beneath it. Does nothing when neither has one.
 */
static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root)
{
  fz_html_box *html_box;
  fz_html_box *body_box;

  /* Root already paints a background: leave everything alone. */
  if (root->style->background_color.a != 0)
    return;

  html_box = root->down;
  if (html_box == NULL || strcmp(html_box->tag, "html") != 0)
    return;

  if (html_box->style->background_color.a != 0)
  {
    move_background_color_style_up(ctx, g, root, html_box);
    return;
  }

  body_box = html_box->down;
  if (body_box == NULL || strcmp(body_box->tag, "body") != 0)
    return;

  if (body_box->style->background_color.a != 0)
    move_background_color_style_up(ctx, g, root, body_box);
}
1913
1914
static void
1915
xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css,
1916
  fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi)
1917
0
{
1918
0
  fz_xml *root, *node;
1919
0
  char *title;
1920
1921
0
  fz_css_match root_match, match;
1922
0
  struct genstate g = {0};
1923
1924
0
  g.pool = NULL;
1925
0
  g.set = set;
1926
0
  g.zip = zip;
1927
0
  g.images = NULL;
1928
0
  g.xml = xml;
1929
0
  g.is_fb2 = 0;
1930
0
  g.base_uri = base_uri;
1931
0
  g.css = NULL;
1932
0
  g.at_bol = 0;
1933
0
  g.emit_white = 0;
1934
0
  g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
1935
0
  g.list_counter = 0;
1936
0
  g.section_depth = 0;
1937
0
  g.markup_dir = FZ_BIDI_LTR;
1938
0
  g.markup_lang = FZ_LANG_UNSET;
1939
0
  g.href = NULL;
1940
0
  g.styles = NULL;
1941
1942
0
  if (rtitle)
1943
0
    *rtitle = NULL;
1944
1945
0
  root = fz_xml_root(g.xml);
1946
0
  g.css = fz_new_css(ctx);
1947
1948
0
#ifndef NDEBUG
1949
0
  if (fz_atoi(getenv("FZ_DEBUG_XML")))
1950
0
    fz_debug_xml(root, 0);
1951
0
#endif
1952
1953
0
  fz_try(ctx)
1954
0
  {
1955
0
    if (try_fictionbook && fz_xml_find(root, "FictionBook"))
1956
0
    {
1957
0
      g.is_fb2 = 1;
1958
0
      fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
1959
0
      if (fz_use_document_css(ctx))
1960
0
        fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1961
0
      g.images = load_fb2_images(ctx, root);
1962
0
    }
1963
0
    else if (is_mobi)
1964
0
    {
1965
0
      g.is_fb2 = 0;
1966
0
      fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1967
0
      fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>");
1968
0
      if (fz_use_document_css(ctx))
1969
0
        html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1970
0
    }
1971
0
    else
1972
0
    {
1973
0
      g.is_fb2 = 0;
1974
0
      fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1975
0
      if (fz_use_document_css(ctx))
1976
0
        html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1977
0
    }
1978
1979
0
    if (user_css)
1980
0
    {
1981
0
      fz_parse_css(ctx, g.css, user_css, "<user>");
1982
0
      fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
1983
0
    }
1984
0
  }
1985
0
  fz_catch(ctx)
1986
0
  {
1987
0
    drop_table_styles(ctx, &g.tab_styles);
1988
0
    fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1989
0
    fz_drop_css(ctx, g.css);
1990
0
    fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1991
0
    fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
1992
0
    fz_report_error(ctx);
1993
0
    fz_warn(ctx, "ignoring styles");
1994
0
    g.css = fz_new_css(ctx);
1995
0
    g.images = NULL;
1996
0
  }
1997
1998
0
#ifndef NDEBUG
1999
0
  if (fz_atoi(getenv("FZ_DEBUG_CSS")))
2000
0
    fz_debug_css(ctx, g.css);
2001
0
#endif
2002
2003
0
  fz_try(ctx)
2004
0
  {
2005
0
    fz_css_style style;
2006
0
    int display;
2007
2008
0
    fz_match_css_at_page(ctx, &root_match, g.css);
2009
0
    fz_apply_css_style(ctx, g.set, &style, &root_match);
2010
2011
0
    g.pool = tree->pool;
2012
0
    if (style.direction != FZ_BIDI_UNSET)
2013
0
      g.markup_dir = style.direction;
2014
0
    g.markup_lang = FZ_LANG_UNSET;
2015
2016
    // Create root node
2017
0
    tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style);
2018
    // TODO: transfer page margins out of this hacky box
2019
2020
0
    tree->root->tag = ":root";
2021
0
    tree->root->s.layout.em = 0;
2022
0
    tree->root->s.layout.x = 0;
2023
0
    tree->root->s.layout.y = 0;
2024
0
    tree->root->s.layout.w = 0;
2025
0
    tree->root->s.layout.b = 0;
2026
2027
    // Create document node (html).
2028
0
    fz_match_css(ctx, &match, &root_match, g.css, root);
2029
0
    fz_apply_css_style(ctx, g.set, &style, &match);
2030
0
    display = fz_get_css_match_display(&match);
2031
0
    gen2_tag(ctx, &g, tree->root, root, &match, display, &style);
2032
2033
0
    detect_directionality(ctx, g.pool, tree->root);
2034
2035
0
    if (g.is_fb2)
2036
0
    {
2037
0
      node = fz_xml_find(root, "FictionBook");
2038
0
      node = fz_xml_find_down(node, "description");
2039
0
      node = fz_xml_find_down(node, "title-info");
2040
0
      node = fz_xml_find_down(node, "book-title");
2041
0
      if (rtitle)
2042
0
      {
2043
0
        title = fz_xml_text(fz_xml_down(node));
2044
0
        if (title)
2045
0
          *rtitle = fz_pool_strdup(ctx, g.pool, title);
2046
0
      }
2047
0
    }
2048
0
    else
2049
0
    {
2050
0
      node = fz_xml_find(root, "html");
2051
0
      node = fz_xml_find_down(node, "head");
2052
0
      node = fz_xml_find_down(node, "title");
2053
0
      if (rtitle)
2054
0
      {
2055
0
        title = fz_xml_text(fz_xml_down(node));
2056
0
        if (title)
2057
0
          *rtitle = fz_pool_strdup(ctx, g.pool, title);
2058
0
      }
2059
2060
      // Move html or body background-color to :root.
2061
0
      move_background_color_up(ctx, &g, tree->root);
2062
0
    }
2063
0
  }
2064
0
  fz_always(ctx)
2065
0
  {
2066
0
    drop_table_styles(ctx, &g.tab_styles);
2067
0
    fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
2068
0
    fz_drop_css(ctx, g.css);
2069
0
  }
2070
0
  fz_catch(ctx)
2071
0
  {
2072
0
    if (rtitle)
2073
0
    {
2074
0
      fz_free(ctx, *rtitle);
2075
0
      *rtitle = NULL;
2076
0
    }
2077
0
    fz_rethrow(ctx);
2078
0
  }
2079
0
}
2080
2081
/* CSS font-size values for the seven font size slots used when patching
 * MOBI <font size="..."> attributes; slot 0 is the smallest. */
static const char *mobi_font_size[7] = {
  "0.67em", /* slot 0 */
  "0.83em", /* slot 1 */
  "1em",    /* slot 2 */
  "1.17em", /* slot 3 */
  "1.33em", /* slot 4 */
  "1.5em",  /* slot 5 */
  "1.67em", /* slot 6 */
};
2090
2091
static void
2092
patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node)
2093
0
{
2094
0
  fz_xml *down;
2095
0
  char buf[500];
2096
0
  while (node)
2097
0
  {
2098
0
    char *tag = fz_xml_tag(node);
2099
0
    if (tag)
2100
0
    {
2101
      // Read MOBI attributes, convert to inline CSS style
2102
0
      if (!strcmp(tag, "font"))
2103
0
      {
2104
0
        const char *size = fz_xml_att(node, "size");
2105
0
        if (size)
2106
0
        {
2107
0
          if (!strcmp(size, "1")) size = mobi_font_size[0];
2108
0
          else if (!strcmp(size, "2")) size = mobi_font_size[1];
2109
0
          else if (!strcmp(size, "3")) size = mobi_font_size[2];
2110
0
          else if (!strcmp(size, "4")) size = mobi_font_size[3];
2111
0
          else if (!strcmp(size, "5")) size = mobi_font_size[4];
2112
0
          else if (!strcmp(size, "6")) size = mobi_font_size[5];
2113
0
          else if (!strcmp(size, "7")) size = mobi_font_size[6];
2114
0
          else if (!strcmp(size, "+1")) size = mobi_font_size[3];
2115
0
          else if (!strcmp(size, "+2")) size = mobi_font_size[4];
2116
0
          else if (!strcmp(size, "+3")) size = mobi_font_size[5];
2117
0
          else if (!strcmp(size, "+4")) size = mobi_font_size[6];
2118
0
          else if (!strcmp(size, "+5")) size = mobi_font_size[6];
2119
0
          else if (!strcmp(size, "+6")) size = mobi_font_size[6];
2120
0
          else if (!strcmp(size, "-1")) size = mobi_font_size[1];
2121
0
          else if (!strcmp(size, "-2")) size = mobi_font_size[0];
2122
0
          else if (!strcmp(size, "-3")) size = mobi_font_size[0];
2123
0
          else if (!strcmp(size, "-4")) size = mobi_font_size[0];
2124
0
          else if (!strcmp(size, "-5")) size = mobi_font_size[0];
2125
0
          else if (!strcmp(size, "-6")) size = mobi_font_size[0];
2126
0
          fz_snprintf(buf, sizeof buf, "font-size:%s", size);
2127
0
          fz_xml_add_att(ctx, pool, node, "style", buf);
2128
0
        }
2129
0
      }
2130
0
      else
2131
0
      {
2132
0
        char *height = fz_xml_att(node, "height");
2133
0
        char *width = fz_xml_att(node, "width");
2134
0
        char *align = fz_xml_att(node, "align");
2135
0
        if (height || width || align)
2136
0
        {
2137
0
          buf[0] = 0;
2138
0
          if (height)
2139
0
          {
2140
0
            fz_strlcat(buf, "margin-top:", sizeof buf);
2141
0
            fz_strlcat(buf, height, sizeof buf);
2142
0
            fz_strlcat(buf, ";", sizeof buf);
2143
0
          }
2144
0
          if (width)
2145
0
          {
2146
0
            fz_strlcat(buf, "text-indent:", sizeof buf);
2147
0
            fz_strlcat(buf, width, sizeof buf);
2148
0
            fz_strlcat(buf, ";", sizeof buf);
2149
0
          }
2150
0
          if (align)
2151
0
          {
2152
0
            fz_strlcat(buf, "text-align:", sizeof buf);
2153
0
            fz_strlcat(buf, align, sizeof buf);
2154
0
            fz_strlcat(buf, ";", sizeof buf);
2155
0
          }
2156
0
          fz_xml_add_att(ctx, pool, node, "style", buf);
2157
0
        }
2158
0
        if (!strcmp(tag, "img"))
2159
0
        {
2160
0
          char *recindex = fz_xml_att(node, "recindex");
2161
0
          if (recindex)
2162
0
            fz_xml_add_att(ctx, pool, node, "src", recindex);
2163
0
        }
2164
0
      }
2165
0
    }
2166
2167
0
    down = fz_xml_down(node);
2168
0
    if (down)
2169
0
      patch_mobi_html(ctx, pool, down);
2170
2171
0
    node = fz_xml_next(node);
2172
0
  }
2173
0
}
2174
2175
static void
2176
fz_parse_html_tree(fz_context *ctx,
2177
  fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
2178
  int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi)
2179
0
{
2180
0
  fz_xml_doc *xml;
2181
2182
0
  if (rtitle)
2183
0
    *rtitle = NULL;
2184
2185
0
  xml = parse_to_xml(ctx, buf, try_xml, try_html5);
2186
2187
0
  if (patch_mobi)
2188
0
    patch_mobi_html(ctx, xml->u.doc.pool, fz_xml_root(xml));
2189
2190
0
  fz_try(ctx)
2191
0
    xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi);
2192
0
  fz_always(ctx)
2193
0
    fz_drop_xml(ctx, xml);
2194
0
  fz_catch(ctx)
2195
0
    fz_rethrow(ctx);
2196
0
}
2197
2198
/* Allocate a TYPE-sized object via fz_new_html_tree_of_size (with DROP as
 * its drop function), label it with the type name via Memento_label, and
 * cast the result to TYPE *. */
#define fz_new_derived_html_tree(CTX, TYPE, DROP) \
  ((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE))
2200
2201
static fz_html_tree *
2202
fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop)
2203
0
{
2204
0
  fz_pool *pool = fz_new_pool(ctx);
2205
0
  fz_html_tree *tree;
2206
2207
0
  fz_try(ctx)
2208
0
  {
2209
0
    tree = fz_pool_alloc(ctx, pool, size);
2210
0
    FZ_INIT_STORABLE(tree, 1, drop);
2211
0
    tree->pool = pool;
2212
0
  }
2213
0
  fz_catch(ctx)
2214
0
  {
2215
0
    fz_drop_pool(ctx, pool);
2216
0
    fz_rethrow(ctx);
2217
0
  }
2218
2219
0
  return tree;
2220
0
}
2221
2222
fz_html *
2223
fz_parse_html(fz_context *ctx,
2224
  fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
2225
  int try_xml, int try_html5, int patch_mobi)
2226
0
{
2227
0
  fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp);
2228
2229
0
  html->layout_w = 0;
2230
0
  html->layout_h = 0;
2231
0
  html->layout_em = 0;
2232
2233
0
  fz_try(ctx)
2234
0
    fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi);
2235
0
  fz_catch(ctx)
2236
0
  {
2237
0
    fz_drop_html(ctx, html);
2238
0
    fz_rethrow(ctx);
2239
0
  }
2240
2241
0
  return html;
2242
0
}
2243
2244
typedef struct
2245
{
2246
  int saved;
2247
  fz_warning_cb *old;
2248
  void *arg;
2249
  fz_buffer *buffer;
2250
  fz_context *ctx;
2251
} warning_save;
2252
2253
static void
2254
warn_to_buffer(void *user, const char *message)
2255
0
{
2256
0
  warning_save *save = (warning_save *)user;
2257
0
  fz_context *ctx = save->ctx;
2258
2259
0
  fz_try(ctx)
2260
0
  {
2261
0
    fz_append_string(ctx, save->buffer, message);
2262
0
    fz_append_byte(ctx, save->buffer, '\n');
2263
0
  }
2264
0
  fz_catch(ctx)
2265
0
  {
2266
    /* Silently swallow the error. */
2267
0
    fz_rethrow_if(ctx, FZ_ERROR_SYSTEM);
2268
0
    fz_report_error(ctx);
2269
0
  }
2270
0
}
2271
2272
static void
2273
redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save)
2274
0
{
2275
0
  save->saved = 1;
2276
0
  save->old = fz_warning_callback(ctx, &save->arg);
2277
0
  save->buffer = buf;
2278
0
  save->ctx = ctx;
2279
2280
0
  fz_flush_warnings(ctx);
2281
0
  fz_set_warning_callback(ctx, warn_to_buffer, save);
2282
0
}
2283
2284
static void
2285
restore_warnings(fz_context *ctx, warning_save *save)
2286
0
{
2287
0
  if (!save->saved)
2288
0
    return;
2289
2290
0
  fz_flush_warnings(ctx);
2291
0
  fz_set_warning_callback(ctx, save->old, save->arg);
2292
0
}
2293
2294
fz_story *
2295
fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip)
2296
0
{
2297
0
  fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp);
2298
0
  warning_save saved = { 0 };
2299
0
  fz_buffer *local_buffer = NULL;
2300
2301
0
  if (buf == NULL)
2302
0
  {
2303
0
    local_buffer = fz_new_buffer(ctx, 0);
2304
0
    buf = local_buffer;
2305
0
  }
2306
2307
0
  fz_var(local_buffer);
2308
0
  fz_var(saved);
2309
2310
0
  fz_try(ctx)
2311
0
  {
2312
0
    story->zip = fz_keep_archive(ctx, zip);
2313
0
    story->font_set = fz_new_html_font_set(ctx);
2314
0
    story->em = em;
2315
0
    story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL;
2316
0
    story->warnings = fz_new_buffer(ctx, 128);
2317
0
    redirect_warnings_to_buffer(ctx, story->warnings, &saved);
2318
0
    story->dom = parse_to_xml(ctx, buf, 0, 1);
2319
0
  }
2320
0
  fz_always(ctx)
2321
0
  {
2322
0
    restore_warnings(ctx, &saved);
2323
0
    fz_drop_buffer(ctx, local_buffer);
2324
0
  }
2325
0
  fz_catch(ctx)
2326
0
  {
2327
0
    fz_drop_html_tree(ctx, &story->tree);
2328
0
    fz_rethrow(ctx);
2329
0
  }
2330
2331
0
  return story;
2332
0
}
2333
2334
fz_html *
2335
fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
2336
0
{
2337
  /* try as XML first, fall back to HTML5 */
2338
0
  return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0);
2339
0
}
2340
2341
/* Write 'level' tab characters to stdout; nothing for level <= 0. */
static void indent(int level)
{
  int i;

  for (i = 0; i < level; i++)
    putchar('\t');
}
2346
2347
static void
2348
fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
2349
0
{
2350
0
  fz_html_box *sbox = NULL;
2351
0
  while (flow)
2352
0
  {
2353
0
    if (flow->box != sbox) {
2354
0
      sbox = flow->box;
2355
0
      indent(level);
2356
0
#ifndef NDEBUG
2357
0
      printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
2358
#else
2359
      printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font));
2360
#endif
2361
0
      if (fz_font_is_serif(ctx, sbox->style->font))
2362
0
        printf(" serif");
2363
0
      else
2364
0
        printf(" sans");
2365
0
      if (fz_font_is_monospaced(ctx, sbox->style->font))
2366
0
        printf(" monospaced");
2367
0
      if (fz_font_is_bold(ctx, sbox->style->font))
2368
0
        printf(" bold");
2369
0
      if (fz_font_is_italic(ctx, sbox->style->font))
2370
0
        printf(" italic");
2371
0
      if (sbox->style->small_caps)
2372
0
        printf(" small-caps");
2373
0
      printf("\n");
2374
0
    }
2375
2376
0
    indent(level);
2377
0
    switch (flow->type) {
2378
0
    case FLOW_WORD: printf("word "); break;
2379
0
    case FLOW_SPACE: printf("space"); break;
2380
0
    case FLOW_SBREAK: printf("sbrk "); break;
2381
0
    case FLOW_SHYPHEN: printf("shy  "); break;
2382
0
    case FLOW_BREAK: printf("break"); break;
2383
0
    case FLOW_IMAGE: printf("image"); break;
2384
0
    case FLOW_ANCHOR: printf("anchor"); break;
2385
0
    }
2386
0
    printf(" script=%d", flow->script);
2387
    // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w);
2388
0
    if (flow->type == FLOW_IMAGE)
2389
0
      printf(" h=%g", flow->h);
2390
0
    if (flow->type == FLOW_WORD)
2391
0
      printf(" text='%s'", flow->content.text);
2392
0
    printf("\n");
2393
0
    if (flow->breaks_line) {
2394
0
      indent(level);
2395
0
      printf("*\n");
2396
0
    }
2397
2398
0
    flow = flow->next;
2399
0
  }
2400
0
}
2401
2402
fz_structure fz_html_tag_to_structure(const char *tag)
2403
0
{
2404
0
  if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT;
2405
0
  if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV;
2406
0
  if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN;
2407
0
  if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE;
2408
0
  if (!strcmp(tag, "p")) return FZ_STRUCTURE_P;
2409
0
  if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1;
2410
0
  if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2;
2411
0
  if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3;
2412
0
  if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4;
2413
0
  if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5;
2414
0
  if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6;
2415
0
  if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST;
2416
0
  if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST;
2417
0
  if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST;
2418
0
  if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM;
2419
0
  if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE;
2420
0
  if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR;
2421
0
  if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH;
2422
0
  if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD;
2423
0
  if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD;
2424
0
  if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY;
2425
0
  if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT;
2426
0
  return FZ_STRUCTURE_INVALID;
2427
0
}
2428
2429
/*
 * Debug-print one CSS number as ">label: value (kind)" at indent
 * level+1. Numbers whose unit is N_UNDEFINED or N_AUTO print nothing.
 * Units not listed explicitly are printed like N_NUMBER.
 */
static void fz_debug_css_number(int level, const char *label, fz_css_number number)
{
  if (number.unit == N_UNDEFINED)
    return;
  if (number.unit == N_AUTO)
    return;

  indent(level+1);
  printf(">%s: ", label);

  switch (number.unit) {
  case N_LENGTH:
    printf("%g (len)\n", number.value);
    break;
  case N_SCALE:
    printf("%g (scale)\n", number.value);
    break;
  case N_PERCENT:
    /* NOTE(review): value is scaled by 0.01 and then printed with a
     * '%' suffix -- confirm this is the intended representation. */
    printf("%g%%\n", number.value * 0.01f);
    break;
  case N_NUMBER:
  default:
    printf("%g (num)\n", number.value);
    break;
  }
}
2443
2444
static void
2445
fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
2446
0
{
2447
0
  while (box)
2448
0
  {
2449
0
    indent(level);
2450
0
    printf("box ");
2451
#ifdef DEBUG_HTML_SEQ
2452
    printf("seq=%d ", box->seq);
2453
#endif
2454
0
    switch (box->type) {
2455
0
    case BOX_BLOCK: printf("block"); break;
2456
0
    case BOX_FLOW: printf("flow"); break;
2457
0
    case BOX_INLINE: printf("inline"); break;
2458
0
    case BOX_TABLE: printf("table"); break;
2459
0
    case BOX_TABLE_ROW: printf("table-row"); break;
2460
0
    case BOX_TABLE_CELL: printf("table-cell"); break;
2461
0
    }
2462
2463
0
    printf(" <%s>", box->tag);
2464
    // printf(" em=%g", box->em);
2465
    // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b);
2466
2467
0
    if (box->markup_dir == FZ_BIDI_RTL)
2468
0
      printf(" rtl");
2469
0
    if (box->is_first_flow)
2470
0
      printf(" is-first-flow");
2471
0
    if (box->list_item)
2472
0
      printf(" list=%d", box->list_item);
2473
0
    if (box->id)
2474
0
      printf(" id=(%s)", box->id);
2475
0
    if (box->href)
2476
0
      printf(" href=(%s)", box->href);
2477
0
    printf("\n");
2478
2479
0
    if (box->type == BOX_BLOCK || box->type == BOX_TABLE || box->type == BOX_TABLE_CELL) {
2480
0
      if (box->style->background_color.a != 0)
2481
0
      {
2482
0
        indent(level+1);
2483
0
        printf(">background-color=#%02x%02x%02x%02x\n",
2484
0
          box->style->background_color.a,
2485
0
          box->style->background_color.r,
2486
0
          box->style->background_color.g,
2487
0
          box->style->background_color.b);
2488
0
      }
2489
0
      if (box->style->position != POS_STATIC)
2490
0
      {
2491
0
        indent(level+1);
2492
0
        printf(">position: %s\n", box->style->position == POS_RELATIVE ? "relative" :
2493
0
          box->style->position == POS_FIXED ? "fixed" : "absolute");
2494
0
      }
2495
0
      fz_debug_css_number(level, "width", box->style->width);
2496
0
      fz_debug_css_number(level, "height", box->style->height);
2497
0
      if (box->u.block.margin[0] != 0 || box->u.block.margin[1] != 0 || box->u.block.margin[2] != 0 || box->u.block.margin[3] != 0)
2498
0
      {
2499
0
        indent(level+1);
2500
0
        printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]);
2501
0
      }
2502
0
      if (box->u.block.border[0] != 0 || box->u.block.border[1] != 0 || box->u.block.border[2] != 0 || box->u.block.border[3] != 0)
2503
0
      {
2504
0
        indent(level+1);
2505
0
        printf(">border=(%g %g %g %g) #%02x%02x%02x%02x\n",
2506
0
          box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3],
2507
0
          box->style->border_color->a, box->style->border_color->r, box->style->border_color->g, box->style->border_color->b);
2508
0
      }
2509
0
      if (box->u.block.padding[0] != 0 || box->u.block.padding[1] != 0 || box->u.block.padding[2] != 0 || box->u.block.padding[3] != 0)
2510
0
      {
2511
0
        indent(level+1);
2512
0
        printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]);
2513
0
      }
2514
0
    }
2515
0
    indent(level+1);
2516
0
    printf(">layout=(%g %g)->(%g %g)\n", box->s.layout.x, box->s.layout.y, box->s.layout.w + box->s.layout.x, box->s.layout.b);
2517
2518
0
    if (box->down)
2519
0
      fz_debug_html_box(ctx, box->down, level + 1);
2520
0
    if (box->type == BOX_FLOW) {
2521
0
      indent(level+1);
2522
0
      printf("flow\n");
2523
0
      fz_debug_html_flow(ctx, box->u.flow.head, level + 2);
2524
0
    }
2525
2526
0
    box = box->next;
2527
0
  }
2528
0
}
2529
2530
void
2531
fz_debug_html(fz_context *ctx, fz_html_box *box)
2532
0
{
2533
0
  fz_debug_html_box(ctx, box, 0);
2534
0
}
2535
2536
static size_t
2537
fz_html_size(fz_context *ctx, fz_html *html)
2538
0
{
2539
0
  return html ? fz_pool_size(ctx, html->tree.pool) : 0;
2540
0
}
2541
2542
/* Magic to make html storable. */

/* Store key identifying parsed html by owning document and chapter. */
typedef struct {
  int refs;        /* reference count */
  void *doc;       /* owning document, compared by pointer */
  int chapter_num; /* chapter number within that document */
} fz_html_key;
2548
2549
static int
2550
fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
2551
0
{
2552
0
  fz_html_key *key = (fz_html_key *)key_;
2553
0
  hash->u.pi.ptr = key->doc;
2554
0
  hash->u.pi.i = key->chapter_num;
2555
0
  return 1;
2556
0
}
2557
2558
static void *
2559
fz_keep_html_key(fz_context *ctx, void *key_)
2560
0
{
2561
0
  fz_html_key *key = (fz_html_key *)key_;
2562
0
  return fz_keep_imp(ctx, key, &key->refs);
2563
0
}
2564
2565
static void
2566
fz_drop_html_key(fz_context *ctx, void *key_)
2567
0
{
2568
0
  fz_html_key *key = (fz_html_key *)key_;
2569
0
  if (fz_drop_imp(ctx, key, &key->refs))
2570
0
  {
2571
0
    fz_free(ctx, key);
2572
0
  }
2573
0
}
2574
2575
static int
2576
fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
2577
0
{
2578
0
  fz_html_key *k0 = (fz_html_key *)k0_;
2579
0
  fz_html_key *k1 = (fz_html_key *)k1_;
2580
0
  return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num;
2581
0
}
2582
2583
static void
2584
fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_)
2585
0
{
2586
0
  fz_html_key *key = (fz_html_key *)key_;
2587
0
  fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num);
2588
0
}
2589
2590
static const fz_store_type fz_html_store_type =
2591
{
2592
  "fz_html",
2593
  fz_make_hash_html_key,
2594
  fz_keep_html_key,
2595
  fz_drop_html_key,
2596
  fz_cmp_html_key,
2597
  fz_format_html_key,
2598
  NULL
2599
};
2600
2601
/*
 * Put parsed html into the store under (doc, chapter).
 *
 * If fz_store_item hands back a different html for the same key, the
 * html passed in is dropped and the returned one is used instead.
 * Storing is best-effort: a failure is reported and the html passed in
 * is returned unchanged.
 *
 * Returns the html the caller should use from here on.
 */
fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
{
  fz_html_key *key = NULL;
  fz_html *other_html;

  /* Stick the parsed html in the store */
  fz_var(key);

  fz_try(ctx)
  {
    key = fz_malloc_struct(ctx, fz_html_key);
    key->refs = 1;
    key->doc = doc;
    key->chapter_num = chapter;
    other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
    if (other_html)
    {
      fz_drop_html(ctx, html);
      html = other_html;
    }
  }
  fz_always(ctx)
    fz_drop_html_key(ctx, key);
  fz_catch(ctx)
  {
    /* Failing to store is not fatal, but a caught error that is not
     * rethrown is reported, as the other swallowing catch blocks in
     * this file do. */
    fz_report_error(ctx);
  }

  return html;
}
2631
2632
/* Look up html in the store by (doc, chapter) using a temporary key on
 * the stack. Returns whatever fz_find_item returns for that key. */
fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
{
  fz_html_key lookup;

  lookup.chapter_num = chapter;
  lookup.doc = doc;
  lookup.refs = 1;

  return fz_find_item(ctx, &fz_drop_html_imp, &lookup, &fz_html_store_type);
}
2641
2642
static int
2643
html_filter_store(fz_context *ctx, void *doc, void *key_)
2644
0
{
2645
0
  fz_html_key *key = (fz_html_key *)key_;
2646
2647
0
  return (doc == key->doc);
2648
0
}
2649
2650
/* Run the store filter over fz_html entries, selecting those whose key
 * belongs to 'doc'. */
void fz_purge_stored_html(fz_context *ctx, void *doc)
{
  const fz_store_type *type = &fz_html_store_type;

  fz_filter_store(ctx, html_filter_store, doc, type);
}
2654
2655
static void
2656
convert_to_boxes(fz_context *ctx, fz_story *story)
2657
0
{
2658
0
  warning_save saved = { 0 };
2659
2660
0
  if (story->dom == NULL)
2661
0
    return;
2662
2663
0
  fz_var(saved);
2664
2665
0
  fz_try(ctx)
2666
0
  {
2667
0
    redirect_warnings_to_buffer(ctx, story->warnings, &saved);
2668
0
    xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0);
2669
0
  }
2670
0
  fz_always(ctx)
2671
0
  {
2672
0
    fz_drop_xml(ctx, story->dom);
2673
0
    story->dom = NULL;
2674
0
    restore_warnings(ctx, &saved);
2675
0
  }
2676
0
  fz_catch(ctx)
2677
0
    fz_rethrow(ctx);
2678
0
}
2679
2680
/* Place a story into 'where' with no flags set; see
 * fz_place_story_flags. */
int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled)
{
  const int no_flags = 0;

  return fz_place_story_flags(ctx, story, where, filled, no_flags);
}
2684
2685
/*
 * Lay out as much of the story as fits into the rectangle 'where'.
 *
 * filled: optional; set to fz_empty_rect on entry and, after layout, to
 *         the root box's layout rectangle grown by its margin, border
 *         and padding.
 * flags:  stored in the restart state passed to the layout.
 *
 * Returns 0 for a NULL or complete story. Otherwise returns
 * FZ_HTML_RESTART_REASON_NONE when layout recorded no end position,
 * FZ_HTML_RESTART_REASON_LINE_WIDTH when that was the recorded reason,
 * and FZ_HTML_RESTART_REASON_LINE_HEIGHT in all other cases.
 */
int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags)
{
  const float w = where.x1 - where.x0;
  const float h = where.y1 - where.y0;

  if (filled)
    *filled = fz_empty_rect;

  if (story == NULL)
    return 0;
  if (story->complete)
    return 0;

  /* Convert from XML to box model on the first attempt to place.
   * The DOM is unusable from here on in. */
  convert_to_boxes(ctx, story);

  /* Confusingly, we call the layout using restart_draw, not restart_place,
   * because we don't want to destroy the current values in restart_place
   * in case we have to retry later. This means the values are left in
   * the correct struct though! */
  story->restart_draw.start = story->restart_place.start;
  story->restart_draw.start_flow = story->restart_place.start_flow;
  story->restart_draw.start_flags = story->restart_place.start_flags;
  story->restart_draw.end = NULL;
  story->restart_draw.end_flow = NULL;
  story->restart_draw.end_flags = 0;
  story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE;
  story->restart_draw.flags = flags;
  story->bbox = where;

  fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw);

  /* Re-copy the start position from restart_place after the layout. */
  story->restart_draw.start = story->restart_place.start;
  story->restart_draw.start_flow = story->restart_place.start_flow;
  story->restart_draw.start_flags = story->restart_place.start_flags;

  if (filled)
  {
    fz_html_box *b = story->tree.root;

    filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L];
    filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x;
    filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T];
    filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B];
  }

#ifndef NDEBUG
  if (fz_atoi(getenv("FZ_DEBUG_HTML")))
    fz_debug_html(ctx, story->tree.root);
#endif

  if (story->restart_draw.end == NULL)
    return FZ_HTML_RESTART_REASON_NONE;

  return (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH)
    ? FZ_HTML_RESTART_REASON_LINE_WIDTH
    : FZ_HTML_RESTART_REASON_LINE_HEIGHT;
}
2739
2740
const char *
2741
fz_story_warnings(fz_context *ctx, fz_story *story)
2742
0
{
2743
0
  unsigned char *data;
2744
2745
0
  if (!story)
2746
0
    return NULL;
2747
2748
0
  convert_to_boxes(ctx, story);
2749
2750
0
  fz_terminate_buffer(ctx, story->warnings);
2751
2752
0
  if (fz_buffer_storage(ctx, story->warnings, &data) == 0)
2753
0
    return NULL;
2754
2755
0
  return (const char *)data;
2756
0
}