/src/mupdf/source/html/html-parse.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2023 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "mupdf/fitz.h" |
24 | | #include "mupdf/ucdn.h" |
25 | | #include "html-imp.h" |
26 | | |
27 | | #include <string.h> |
28 | | #include <stdio.h> |
29 | | #include <assert.h> |
30 | | |
/* Small integer indices 0..3; presumably top/right/bottom/left for four-sided values -- TODO confirm against users. */
enum
{
	T,
	R,
	B,
	L
};

/* Default bidirectional text direction: left-to-right. */
#define DEFAULT_DIR FZ_BIDI_LTR
34 | | |
/*
 * Built-in CSS rules for HTML elements, stored as one concatenated
 * string literal with one rule per source line.
 */
static const char *html_default_css =
	"@page{margin:3em 2em}"
	"a{color:#06C;text-decoration:underline}"
	"address{display:block;font-style:italic}"
	"b{font-weight:bold}"
	"bdo{direction:rtl;unicode-bidi:bidi-override}"
	"blockquote{display:block;margin:1em 40px}"
	"body{display:block;margin:1em}"
	"cite{font-style:italic}"
	"code{font-family:monospace}"
	"dd{display:block;margin:0 0 0 40px}"
	"del{text-decoration:line-through}"
	"div{display:block}"
	"dl{display:block;margin:1em 0}"
	"dt{display:block}"
	"em{font-style:italic}"
	"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
	"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
	"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
	"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
	"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
	"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
	"head{display:none}"
	"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
	"html{display:block}"
	"i{font-style:italic}"
	"ins{text-decoration:underline}"
	"kbd{font-family:monospace}"
	"li{display:list-item}"
	"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
	"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
	"p{display:block;margin:1em 0}"
	"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
	"samp{font-family:monospace}"
	"script{display:none}"
	"small{font-size:0.83em}"
	"strong{font-weight:bold}"
	"style{display:none}"
	"sub{font-size:0.83em;vertical-align:sub}"
	"sup{font-size:0.83em;vertical-align:super}"
	"table{display:table;border-spacing:2px}"
	"tbody{display:table-row-group}"
	"td{display:table-cell;padding:1px;background-color:inherit}"
	"tfoot{display:table-footer-group}"
	"th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}"
	"thead{display:table-header-group}"
	"tr{display:table-row}"
	"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
	"ul ul{list-style-type:circle}"
	"ul ul ul{list-style-type:square}"
	"var{font-style:italic}"
	"colgroup{display:table-column-group}"
	"col{display:table-column}"
	"caption{display:block;text-align:center}"
;
90 | | |
/*
 * Built-in CSS rules for MOBI content, stored as one concatenated
 * string literal with one rule per source line.
 */
static const char *mobi_default_css =
	"pagebreak{display:block;page-break-before:always}"
	"dl,ol,ul{margin:0}"
	"p{margin:0}"
	"blockquote{margin:0 40px}"
	"center{display:block;text-align:center}"
	"big{font-size:1.17em}"
	"strike{text-decoration:line-through}"
;
100 | | |
/*
 * Built-in CSS rules for FictionBook (FB2) elements, stored as one
 * concatenated string literal with one rule per source line.
 */
static const char *fb2_default_css =
	"@page{margin:3em 2em}"
	"FictionBook{display:block;margin:1em}"
	"stylesheet,binary{display:none}"
	"description>*{display:none}"
	"description>title-info{display:block}"
	"description>title-info>*{display:none}"
	"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
	"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
	"image{display:block}"
	"p>image{display:inline}"
	"table{display:table}"
	"tr{display:table-row}"
	"th,td{display:table-cell}"
	"a{color:#06C;text-decoration:underline}"
	"a[type=note]{font-size:small;vertical-align:super}"
	"code{white-space:pre;font-family:monospace}"
	"emphasis{font-style:italic}"
	"strikethrough{text-decoration:line-through}"
	"strong{font-weight:bold}"
	"sub{font-size:small;vertical-align:sub}"
	"sup{font-size:small;vertical-align:super}"
	"image{margin:1em 0;text-align:center}"
	"cite,poem{margin:1em 2em}"
	"subtitle,epigraph,stanza{margin:1em 0}"
	"title>p{text-align:center;font-size:x-large}"
	"subtitle{text-align:center;font-size:large}"
	"p{margin-top:1em;text-align:justify}"
	"empty-line{padding-top:1em}"
	"p+p{margin-top:0;text-indent:1.5em}"
	"empty-line+p{margin-top:0}"
	"section>title{page-break-before:always}"
;
134 | | |
/*
 * All known HTML tag names, sorted in strcmp() order so that they can be
 * binary searched. FB2 tag names live in their own table below.
 */
static const char *known_html_tags[] = {
	"a", "abbr", "acronym", "address", "annotation-xml", "applet", "area",
	"article", "aside", "audio", "b", "base", "basefont", "bdi", "bdo",
	"bgsound", "big", "blink", "blockquote", "body", "br", "button",
	"canvas", "caption", "center", "cite", "code", "col", "colgroup",
	"data", "datalist", "dd", "del", "desc", "details", "dfn", "dir",
	"div", "dl", "dt", "em", "embed", "fieldset", "figcaption", "figure",
	"font", "footer", "foreignobject", "form", "frame", "frameset", "h1",
	"h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
	"i", "iframe", "image", "img", "input", "ins", "isindex", "kbd",
	"keygen", "label", "legend", "li", "link", "listing", "main",
	"malignmark", "map", "mark", "marquee", "math", "menu", "menuitem",
	"meta", "meter", "mglyph", "mi", "mn", "mo", "ms", "mtext", "multicol",
	"nav", "nextid", "nobr", "noembed", "noframes", "noscript", "object",
	"ol", "optgroup", "option", "output", "p", "param", "plaintext", "pre",
	"progress", "q", "rb", "rp", "rt", "rtc", "ruby", "s", "samp",
	"script", "section", "select", "small", "source", "spacer", "span",
	"strike", "strong", "style", "sub", "summary", "sup", "svg", "table",
	"tbody", "td", "template", "textarea", "tfoot", "th", "thead", "time",
	"title", "tr", "track", "tt", "u", "ul", "var", "video", "wbr", "xmp",
};

/* Known FB2 tag names, sorted in strcmp() order (uppercase sorts first). */
static const char *known_fb2_tags[] = {
	"FictionBook", "a", "binary", "body", "cite", "code", "coverpage",
	"date", "description", "emphasis", "empty-line", "epigraph", "image",
	"p", "poem", "section", "stanza", "strikethrough", "strong",
	"stylesheet", "sub", "subtitle", "sup", "table", "td", "text-author",
	"th", "title", "title-info", "tr", "v",
};

/*
 * Binary search a strcmp()-sorted table of 'count' tag names.
 *
 * Returns the table's own pointer for the matching entry, or NULL if 'tag'
 * is not in the table. The search covers the half-open range [lo, hi).
 */
static const char *find_known_tag(const char *const *tags, size_t count, const char *tag)
{
	size_t lo = 0;
	size_t hi = count;

	while (lo < hi)
	{
		size_t mid = lo + (hi - lo) / 2;
		int c = strcmp(tag, tags[mid]);
		if (c < 0)
			hi = mid;
		else if (c > 0)
			lo = mid + 1;
		else
			return tags[mid];
	}
	return NULL;
}

/*
 * Look up 'tag' among the known HTML tags.
 *
 * Returns the statically allocated table entry, or NULL if unknown.
 * The whole table is searched; the previous code halved the element
 * count and so never found tags in the upper half (e.g. "p", "table").
 */
static const char *find_known_html_tag(const char *tag)
{
	return find_known_tag(known_html_tags,
		sizeof known_html_tags / sizeof known_html_tags[0], tag);
}

/*
 * Look up 'tag' among the known FB2 tags.
 *
 * Returns the statically allocated table entry, or NULL if unknown.
 * The whole table is searched (see find_known_html_tag).
 */
static const char *find_known_fb2_tag(const char *tag)
{
	return find_known_tag(known_fb2_tags,
		sizeof known_fb2_tags / sizeof known_fb2_tags[0], tag);
}
202 | | |
203 | | struct genstate |
204 | | { |
205 | | fz_pool *pool; |
206 | | fz_html_font_set *set; |
207 | | fz_archive *zip; |
208 | | fz_tree *images; |
209 | | fz_xml_doc *xml; |
210 | | int is_fb2; |
211 | | const char *base_uri; |
212 | | fz_css *css; |
213 | | int at_bol; |
214 | | fz_html_box *emit_white; |
215 | | int last_brk_cls; |
216 | | |
217 | | int list_counter; |
218 | | int section_depth; |
219 | | fz_bidi_direction markup_dir; |
220 | | fz_text_language markup_lang; |
221 | | char *href; |
222 | | |
223 | | fz_css_style_splay *styles; |
224 | | }; |
225 | | |
/* Return 1 if c is a space, tab, carriage return or newline; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
230 | | |
/*
 * Return 1 if s consists only of space, tab, CR and LF characters
 * (the empty string counts), 0 as soon as any other character occurs.
 */
static int is_all_white(const char *s)
{
	/* Skip the leading run of whitespace; all-white means we hit the terminator. */
	return s[strspn(s, " \t\r\n")] == '\0';
}
241 | | |
242 | | /* TODO: pool allocator for flow nodes */ |
243 | | /* TODO: store text by pointing to a giant buffer */ |
244 | | |
245 | | static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow) |
246 | 244 | { |
247 | 4.60k | while (flow) |
248 | 4.36k | { |
249 | 4.36k | fz_html_flow *next = flow->next; |
250 | 4.36k | if (flow->type == FLOW_IMAGE) |
251 | 0 | fz_drop_image(ctx, flow->content.image); |
252 | 4.36k | flow = next; |
253 | 4.36k | } |
254 | 244 | } |
255 | | |
256 | | static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras) |
257 | 4.36k | { |
258 | 4.36k | size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras); |
259 | 4.36k | fz_html_flow *flow; |
260 | | |
261 | | /* Shouldn't happen, but bug 705324. */ |
262 | 4.36k | if (top == NULL || top->type != BOX_FLOW) |
263 | 0 | return NULL; |
264 | | |
265 | 4.36k | flow = fz_pool_alloc(ctx, pool, size); |
266 | 4.36k | flow->type = type; |
267 | 4.36k | flow->expand = 0; |
268 | 4.36k | flow->bidi_level = 0; |
269 | 4.36k | flow->markup_lang = 0; |
270 | 4.36k | flow->breaks_line = 0; |
271 | 4.36k | flow->box = inline_box; |
272 | 4.36k | (*top->s.build.flow_tail) = flow; |
273 | 4.36k | top->s.build.flow_tail = &flow->next; |
274 | 4.36k | return flow; |
275 | 4.36k | } |
276 | | |
277 | | static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
278 | 1.57k | { |
279 | 1.57k | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0); |
280 | 1.57k | if (flow) |
281 | 1.57k | flow->expand = 1; |
282 | 1.57k | } |
283 | | |
284 | | static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
285 | 0 | { |
286 | 0 | (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0); |
287 | 0 | } |
288 | | |
289 | | static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
290 | 484 | { |
291 | 484 | (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0); |
292 | 484 | } |
293 | | |
294 | | static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
295 | 0 | { |
296 | 0 | (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0); |
297 | 0 | } |
298 | | |
299 | | static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) |
300 | 2.30k | { |
301 | 2.30k | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1); |
302 | 2.30k | if (flow == NULL) |
303 | 0 | return; |
304 | 2.30k | memcpy(flow->content.text, a, b - a); |
305 | 2.30k | flow->content.text[b - a] = 0; |
306 | 2.30k | flow->markup_lang = lang; |
307 | 2.30k | } |
308 | | |
309 | | static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) |
310 | 0 | { |
311 | 0 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0); |
312 | 0 | if (flow) |
313 | 0 | flow->content.image = fz_keep_image(ctx, img); |
314 | 0 | } |
315 | | |
316 | | static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
317 | 0 | { |
318 | 0 | (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0); |
319 | 0 | } |
320 | | |
321 | | fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) |
322 | 0 | { |
323 | 0 | fz_html_flow *new_flow; |
324 | 0 | char *text; |
325 | 0 | size_t len; |
326 | |
|
327 | 0 | assert(flow->type == FLOW_WORD); |
328 | | |
329 | 0 | if (offset == 0) |
330 | 0 | return flow; |
331 | 0 | text = flow->content.text; |
332 | 0 | while (*text && offset) |
333 | 0 | { |
334 | 0 | int rune; |
335 | 0 | text += fz_chartorune(&rune, text); |
336 | 0 | offset--; |
337 | 0 | } |
338 | 0 | len = strlen(text); |
339 | 0 | new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1); |
340 | 0 | memcpy(new_flow, flow, offsetof(fz_html_flow, content)); |
341 | 0 | new_flow->next = flow->next; |
342 | 0 | flow->next = new_flow; |
343 | 0 | strcpy(new_flow->content.text, text); |
344 | 0 | *text = 0; |
345 | 0 | return new_flow; |
346 | 0 | } |
347 | | |
348 | | static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g) |
349 | 1.81k | { |
350 | 1.81k | static const char *space = " "; |
351 | 1.81k | fz_pool *pool = g->pool; |
352 | 1.81k | if (g->emit_white) |
353 | 1.78k | { |
354 | 1.78k | int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE; |
355 | 1.78k | if (!g->at_bol) |
356 | 1.57k | { |
357 | 1.57k | if (bsp) |
358 | 1.57k | add_flow_space(ctx, pool, flow, g->emit_white); |
359 | 0 | else |
360 | 0 | add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang); |
361 | 1.57k | } |
362 | 1.78k | g->emit_white = 0; |
363 | 1.78k | } |
364 | 1.81k | } |
365 | | |
/*
 * Pair-wise lookup table for UAX#14 line breaks.
 *
 * Indexed as pairbrk[class of previous character][class of this character].
 * Each row is a 29 character string, one column per line break class in the
 * same order as the rows. The cell characters are interpreted by the caller.
 */
static const char *pairbrk[29] =
{
/*	-OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
/*	-PLPULSXYSROULLDNYAB2WMJ23LVTI- */
	"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
	"_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
	"_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
	"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
	"_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
	"_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
	"_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
	"_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
	"%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
	"%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
	"%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
	"%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
	"%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
	"_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
	"_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
	"_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
	"_^^%_%^^^_______%%__^^^______", /* BA break after */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
	"_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
	"____________________^________", /* ZW zero width space */
	"%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
	"_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */
	"_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
	"_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
	"_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
};
401 | | |
402 | | static fz_html_box * |
403 | | find_flow_encloser(fz_context *ctx, fz_html_box *flow) |
404 | 244 | { |
405 | | /* This code was written to assume that there will always be a |
406 | | * flow box enclosing callers of this. Bug 705324 shows that |
407 | | * this isn't always the case. In the absence of a reproducer |
408 | | * file, all I can do is try to patch around the issue so that |
409 | | * we won't crash. */ |
410 | 489 | while (flow->type != BOX_FLOW) |
411 | 245 | { |
412 | 245 | if (flow->up == NULL) |
413 | 0 | { |
414 | 0 | fz_warn(ctx, "Flow encloser not found. Please report this file!"); |
415 | 0 | break; |
416 | 0 | } |
417 | 245 | flow = flow->up; |
418 | 245 | } |
419 | 244 | return flow; |
420 | 244 | } |
421 | | |
422 | | static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g) |
423 | 244 | { |
424 | 244 | fz_html_box *flow; |
425 | 244 | fz_pool *pool = g->pool; |
426 | 244 | int collapse = box->style->white_space & WS_COLLAPSE; |
427 | 244 | int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE; |
428 | 244 | int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE; |
429 | | |
430 | 244 | static const char *space = " "; |
431 | | |
432 | 244 | flow = find_flow_encloser(ctx, box); |
433 | 244 | if (flow == NULL) |
434 | 0 | return; |
435 | | |
436 | 3.67k | while (*text) |
437 | 3.43k | { |
438 | 3.43k | if (bnl && (*text == '\n' || *text == '\r')) |
439 | 0 | { |
440 | 0 | if (text[0] == '\r' && text[1] == '\n') |
441 | 0 | text += 2; |
442 | 0 | else |
443 | 0 | text += 1; |
444 | 0 | add_flow_break(ctx, pool, flow, box); |
445 | 0 | g->at_bol = 1; |
446 | 0 | } |
447 | 3.43k | else if (iswhite(*text)) |
448 | 1.61k | { |
449 | 1.61k | if (collapse) |
450 | 1.61k | { |
451 | 1.61k | if (bnl) |
452 | 0 | while (*text == ' ' || *text == '\t') |
453 | 0 | ++text; |
454 | 1.61k | else |
455 | 3.42k | while (iswhite(*text)) |
456 | 1.81k | ++text; |
457 | 1.61k | g->emit_white = box; |
458 | 1.61k | } |
459 | 0 | else |
460 | 0 | { |
461 | | // TODO: tabs |
462 | 0 | if (bsp) |
463 | 0 | add_flow_space(ctx, pool, flow, box); |
464 | 0 | else |
465 | 0 | add_flow_word(ctx, pool, flow, box, space, space+1, lang); |
466 | 0 | ++text; |
467 | 0 | } |
468 | 1.61k | g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */ |
469 | 1.61k | } |
470 | 1.81k | else |
471 | 1.81k | { |
472 | 1.81k | const char *prev, *mark = text; |
473 | 1.81k | int c; |
474 | | |
475 | 1.81k | flush_space(ctx, flow, lang, g); |
476 | | |
477 | 1.81k | if (g->at_bol) |
478 | 244 | g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; |
479 | | |
480 | 15.4k | while (*text && !iswhite(*text)) |
481 | 13.6k | { |
482 | 13.6k | prev = text; |
483 | 13.6k | text += fz_chartorune(&c, text); |
484 | 13.6k | if (c == 0xAD) /* soft hyphen */ |
485 | 0 | { |
486 | 0 | if (mark != prev) |
487 | 0 | add_flow_word(ctx, pool, flow, box, mark, prev, lang); |
488 | 0 | add_flow_shyphen(ctx, pool, flow, box); |
489 | 0 | mark = text; |
490 | 0 | g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */ |
491 | 0 | } |
492 | 13.6k | else if (bsp) /* allow soft breaks */ |
493 | 13.6k | { |
494 | 13.6k | int this_brk_cls = ucdn_get_resolved_linebreak_class(c); |
495 | 13.6k | if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI) |
496 | 13.6k | { |
497 | 13.6k | int brk = pairbrk[g->last_brk_cls][this_brk_cls]; |
498 | | |
499 | | /* we handle spaces elsewhere, so ignore these classes */ |
500 | 13.6k | if (brk == '@') brk = '^'; |
501 | 13.6k | if (brk == '#') brk = '^'; |
502 | 13.6k | if (brk == '%') brk = '^'; |
503 | | |
504 | 13.6k | if (brk == '_') |
505 | 484 | { |
506 | 484 | if (mark != prev) |
507 | 484 | add_flow_word(ctx, pool, flow, box, mark, prev, lang); |
508 | 484 | add_flow_sbreak(ctx, pool, flow, box); |
509 | 484 | mark = prev; |
510 | 484 | } |
511 | | |
512 | 13.6k | g->last_brk_cls = this_brk_cls; |
513 | 13.6k | } |
514 | 13.6k | } |
515 | 13.6k | } |
516 | 1.81k | if (mark != text) |
517 | 1.81k | add_flow_word(ctx, pool, flow, box, mark, text, lang); |
518 | | |
519 | 1.81k | g->at_bol = 0; |
520 | 1.81k | } |
521 | 3.43k | } |
522 | 244 | } |
523 | | |
524 | | static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) |
525 | 0 | { |
526 | 0 | char path[2048]; |
527 | 0 | fz_image *img = NULL; |
528 | 0 | fz_buffer *buf = NULL; |
529 | |
|
530 | 0 | fz_var(img); |
531 | 0 | fz_var(buf); |
532 | |
|
533 | 0 | fz_try(ctx) |
534 | 0 | { |
535 | 0 | if (!strncmp(src, "data:image/jpeg;base64,", 23)) |
536 | 0 | buf = fz_new_buffer_from_base64(ctx, src+23, 0); |
537 | 0 | else if (!strncmp(src, "data:image/png;base64,", 22)) |
538 | 0 | buf = fz_new_buffer_from_base64(ctx, src+22, 0); |
539 | 0 | else if (!strncmp(src, "data:image/gif;base64,", 22)) |
540 | 0 | buf = fz_new_buffer_from_base64(ctx, src+22, 0); |
541 | 0 | else |
542 | 0 | { |
543 | 0 | fz_strlcpy(path, base_uri, sizeof path); |
544 | 0 | fz_strlcat(path, "/", sizeof path); |
545 | 0 | fz_strlcat(path, src, sizeof path); |
546 | 0 | fz_urldecode(path); |
547 | 0 | fz_cleanname(path); |
548 | 0 | buf = fz_read_archive_entry(ctx, zip, path); |
549 | 0 | } |
550 | 0 | #if FZ_ENABLE_SVG |
551 | 0 | if (strstr(src, ".svg")) |
552 | 0 | img = fz_new_image_from_svg(ctx, buf, base_uri, zip); |
553 | 0 | else |
554 | 0 | #endif |
555 | 0 | img = fz_new_image_from_buffer(ctx, buf); |
556 | 0 | } |
557 | 0 | fz_always(ctx) |
558 | 0 | fz_drop_buffer(ctx, buf); |
559 | 0 | fz_catch(ctx) |
560 | 0 | { |
561 | 0 | fz_ignore_error(ctx); |
562 | 0 | fz_warn(ctx, "html: cannot load image src='%s'", src); |
563 | 0 | } |
564 | |
|
565 | 0 | return img; |
566 | 0 | } |
567 | | |
568 | | static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, |
569 | | fz_xml_doc *xmldoc, fz_xml *node) |
570 | 0 | { |
571 | 0 | fz_image *img = NULL; |
572 | 0 | #if FZ_ENABLE_SVG |
573 | 0 | fz_try(ctx) |
574 | 0 | img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip); |
575 | 0 | fz_catch(ctx) |
576 | 0 | { |
577 | 0 | fz_ignore_error(ctx); |
578 | 0 | fz_warn(ctx, "html: cannot load embedded svg document"); |
579 | 0 | } |
580 | 0 | #endif |
581 | 0 | return img; |
582 | 0 | } |
583 | | |
584 | | static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g) |
585 | 0 | { |
586 | 0 | fz_html_box *flow; |
587 | 0 | fz_pool *pool = g->pool; |
588 | |
|
589 | 0 | flow = find_flow_encloser(ctx, box); |
590 | |
|
591 | 0 | flush_space(ctx, flow, 0, g); |
592 | |
|
593 | 0 | if (!img) |
594 | 0 | { |
595 | 0 | const char *alt = "[image]"; |
596 | 0 | add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); |
597 | 0 | } |
598 | 0 | else |
599 | 0 | { |
600 | 0 | fz_try(ctx) |
601 | 0 | { |
602 | 0 | add_flow_sbreak(ctx, pool, flow, box); |
603 | 0 | add_flow_image(ctx, pool, flow, box, img); |
604 | 0 | add_flow_sbreak(ctx, pool, flow, box); |
605 | 0 | } |
606 | 0 | fz_always(ctx) |
607 | 0 | { |
608 | 0 | fz_drop_image(ctx, img); |
609 | 0 | } |
610 | 0 | fz_catch(ctx) |
611 | 0 | fz_rethrow(ctx); |
612 | 0 | } |
613 | | |
614 | 0 | g->at_bol = 0; |
615 | 0 | } |
616 | | |
617 | | static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) |
618 | 1.10k | { |
619 | 2.15k | while (box) |
620 | 1.04k | { |
621 | 1.04k | fz_html_box *next = box->next; |
622 | 1.04k | if (box->type == BOX_FLOW) |
623 | 244 | fz_drop_html_flow(ctx, box->u.flow.head); |
624 | 1.04k | fz_drop_html_box(ctx, box->down); |
625 | 1.04k | box = next; |
626 | 1.04k | } |
627 | 1.10k | } |
628 | | |
629 | | static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor) |
630 | 14 | { |
631 | 14 | fz_html *html = (fz_html *)stor; |
632 | 14 | fz_drop_html_box(ctx, html->tree.root); |
633 | 14 | fz_drop_pool(ctx, html->tree.pool); |
634 | 14 | } |
635 | | |
636 | | static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor) |
637 | 39 | { |
638 | 39 | fz_story *story = (fz_story *)stor; |
639 | 39 | fz_free(ctx, story->user_css); |
640 | 39 | fz_drop_html_font_set(ctx, story->font_set); |
641 | 39 | fz_drop_xml(ctx, story->dom); |
642 | 39 | fz_drop_html_box(ctx, story->tree.root); |
643 | 39 | fz_drop_buffer(ctx, story->warnings); |
644 | 39 | fz_drop_archive(ctx, story->zip); |
645 | | /* The pool must be the last thing dropped. */ |
646 | 39 | fz_drop_pool(ctx, story->tree.pool); |
647 | 39 | } |
648 | | |
649 | | /* Drop a structure derived from an html_tree. The exact things |
650 | | * freed here will depend upon the drop function with which it |
651 | | * was created. */ |
652 | | static void |
653 | | fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree) |
654 | 107 | { |
655 | 107 | fz_defer_reap_start(ctx); |
656 | 107 | fz_drop_storable(ctx, &tree->storable); |
657 | 107 | fz_defer_reap_end(ctx); |
658 | 107 | } |
659 | | |
660 | | void fz_drop_html(fz_context *ctx, fz_html *html) |
661 | 68 | { |
662 | 68 | fz_drop_html_tree(ctx, &html->tree); |
663 | 68 | } |
664 | | |
665 | | void fz_drop_story(fz_context *ctx, fz_story *story) |
666 | 39 | { |
667 | 39 | if (!story) |
668 | 0 | return; |
669 | | |
670 | 39 | fz_drop_html_tree(ctx, &story->tree); |
671 | 39 | } |
672 | | |
673 | | fz_html *fz_keep_html(fz_context *ctx, fz_html *html) |
674 | 0 | { |
675 | 0 | return fz_keep_storable(ctx, &html->tree.storable); |
676 | 0 | } |
677 | | |
678 | | static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style) |
679 | 1.04k | { |
680 | 1.04k | fz_html_box *box; |
681 | 1.04k | const char *tag = fz_xml_tag(node); |
682 | 1.04k | const char *id = fz_xml_att(node, "id"); |
683 | 1.04k | const char *href; |
684 | | |
685 | 1.04k | if (type == BOX_INLINE) |
686 | 246 | box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u)); |
687 | 803 | else if (type == BOX_FLOW) |
688 | 244 | box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow)); |
689 | 559 | else |
690 | 559 | box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block)); |
691 | | |
692 | 1.04k | box->type = type; |
693 | 1.04k | box->is_first_flow = 0; |
694 | 1.04k | box->markup_dir = g->markup_dir; |
695 | 1.04k | box->heading = 0; |
696 | 1.04k | box->list_item = 0; |
697 | | |
698 | 1.04k | box->style = fz_css_enlist(ctx, style, &g->styles, g->pool); |
699 | | |
700 | 1.04k | if (tag) |
701 | 564 | { |
702 | 564 | box->tag = find_known_html_tag(tag); |
703 | 564 | if (!box->tag && g->is_fb2) |
704 | 0 | box->tag = find_known_fb2_tag(tag); |
705 | 564 | if (!box->tag) |
706 | 452 | box->tag = fz_pool_strdup(ctx, g->pool, tag); |
707 | 564 | } |
708 | 485 | else |
709 | 485 | { |
710 | 485 | box->tag = "#anon"; |
711 | 485 | } |
712 | | |
713 | 1.04k | if (id) |
714 | 4 | box->id = fz_pool_strdup(ctx, g->pool, id); |
715 | | |
716 | 1.04k | if (tag && tag[0]=='a' && tag[1]==0) |
717 | 0 | { |
718 | | // Support deprecated anchor syntax with id in "name" instead of "id" attribute. |
719 | 0 | if (!id) |
720 | 0 | { |
721 | 0 | const char *name = fz_xml_att(node, "name"); |
722 | 0 | if (name) |
723 | 0 | box->id = fz_pool_strdup(ctx, g->pool, name); |
724 | 0 | } |
725 | |
|
726 | 0 | if (g->is_fb2) |
727 | 0 | { |
728 | 0 | href = fz_xml_att(node, "l:href"); |
729 | 0 | if (!href) |
730 | 0 | href = fz_xml_att(node, "xlink:href"); |
731 | 0 | } |
732 | 0 | else |
733 | 0 | { |
734 | 0 | href = fz_xml_att(node, "href"); |
735 | 0 | } |
736 | 0 | if (href) |
737 | 0 | g->href = fz_pool_strdup(ctx, g->pool, href); |
738 | 0 | } |
739 | | |
740 | 1.04k | if (g->href) |
741 | 0 | box->href = g->href; |
742 | | |
743 | 1.04k | if (type == BOX_FLOW) |
744 | 244 | { |
745 | 244 | box->u.flow.head = NULL; |
746 | 244 | box->s.build.flow_tail = &box->u.flow.head; |
747 | 244 | } |
748 | | |
749 | 1.04k | return box; |
750 | 1.04k | } |
751 | | |
752 | | static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child) |
753 | 996 | { |
754 | 996 | child->up = parent; |
755 | 996 | if (!parent->down) |
756 | 638 | parent->down = child; |
757 | 996 | if (parent->s.build.last_child) |
758 | 358 | parent->s.build.last_child->next = child; |
759 | 996 | parent->s.build.last_child = child; |
760 | 996 | } |
761 | | |
762 | | static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box) |
763 | 506 | { |
764 | 506 | while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) |
765 | 0 | box = box->up; |
766 | 506 | return box; |
767 | 506 | } |
768 | | |
769 | | static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box) |
770 | 0 | { |
771 | 0 | fz_html_box *look = box; |
772 | 0 | while (look && look->type != BOX_TABLE) |
773 | 0 | look = look->up; |
774 | 0 | if (look) |
775 | 0 | return look; |
776 | 0 | fz_warn(ctx, "table-row not inside table element"); |
777 | 0 | return NULL; |
778 | 0 | } |
779 | | |
780 | | static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box) |
781 | 0 | { |
782 | 0 | fz_html_box *look = box; |
783 | 0 | while (look && look->type != BOX_TABLE_ROW) |
784 | 0 | look = look->up; |
785 | 0 | if (look) |
786 | 0 | return look; |
787 | 0 | fz_warn(ctx, "table-cell not inside table-row element"); |
788 | 0 | return NULL; |
789 | 0 | } |
790 | | |
791 | | static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box) |
792 | 246 | { |
793 | 246 | fz_css_style style; |
794 | 246 | fz_html_box *flow_box; |
795 | | |
796 | 246 | if (box->type == BOX_FLOW || box->type == BOX_INLINE) |
797 | 2 | return box; |
798 | | |
799 | | // We have an inline element that is not in an existing flow/inline context. |
800 | | |
801 | | // Find the closest block level box to insert content into. |
802 | 244 | while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) |
803 | 0 | box = box->up; |
804 | | |
805 | | // Concatenate onto the last open flow box if we have one. |
806 | 244 | if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW) |
807 | 0 | return box->s.build.last_child; |
808 | | |
809 | | // No flow box found, create and insert one! |
810 | | |
811 | | // TODO: null style instead of default for flow box? |
812 | 244 | fz_default_css_style(ctx, &style); |
813 | 244 | flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style); |
814 | 244 | flow_box->is_first_flow = !box->down; |
815 | 244 | g->at_bol = 1; |
816 | | |
817 | 244 | append_box(ctx, box, flow_box); |
818 | | |
819 | 244 | return flow_box; |
820 | 244 | } |
821 | | |
822 | | static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match); |
823 | | |
824 | | static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) |
825 | 628 | { |
826 | 628 | fz_html_box *anon_box; |
827 | 628 | fz_css_style style; |
828 | 628 | const char *text; |
829 | 628 | int collapse; |
830 | | |
831 | 628 | text = fz_xml_text(node); |
832 | 628 | collapse = root_box->style->white_space & WS_COLLAPSE; |
833 | 628 | if (collapse && is_all_white(text)) |
834 | 384 | { |
835 | 384 | g->emit_white = root_box; |
836 | 384 | } |
837 | 244 | else |
838 | 244 | { |
839 | 244 | if (root_box->type != BOX_INLINE) |
840 | 188 | { |
841 | | /* Create anonymous inline box, with the same style as the top block box. */ |
842 | 188 | style = *root_box->style; |
843 | | |
844 | | // Make sure not to recursively multiply font sizes |
845 | 188 | style.font_size.value = 1; |
846 | 188 | style.font_size.unit = N_SCALE; |
847 | | |
848 | 188 | root_box = find_inline_context(ctx, g, root_box); |
849 | 188 | anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style); |
850 | 188 | append_box(ctx, root_box, anon_box); |
851 | 188 | root_box = anon_box; |
852 | 188 | } |
853 | | |
854 | 244 | generate_text(ctx, root_box, text, g->markup_lang, g); |
855 | 244 | } |
856 | 628 | } |
857 | | |
858 | | static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) |
859 | 58 | { |
860 | 58 | fz_html_box *this_box; |
861 | 58 | fz_html_box *flow_box; |
862 | 58 | root_box = find_inline_context(ctx, g, root_box); |
863 | 58 | this_box = new_box(ctx, g, node, BOX_INLINE, style); |
864 | 58 | append_box(ctx, root_box, this_box); |
865 | 58 | if (this_box->id) |
866 | 0 | { |
867 | 0 | flow_box = find_flow_encloser(ctx, this_box); |
868 | 0 | add_flow_anchor(ctx, g->pool, flow_box, this_box); |
869 | 0 | } |
870 | 58 | return this_box; |
871 | 58 | } |
872 | | |
873 | | static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) |
874 | 0 | { |
875 | 0 | fz_html_box *this_box; |
876 | 0 | fz_html_box *flow_box; |
877 | |
|
878 | 0 | if (root_box->type != BOX_INLINE) |
879 | 0 | { |
880 | | /* Create inline box to hold the <br> tag, with the same style as containing block. */ |
881 | | /* Make sure not to recursively multiply font sizes. */ |
882 | 0 | fz_css_style style = *root_box->style; |
883 | 0 | style.font_size.value = 1; |
884 | 0 | style.font_size.unit = N_SCALE; |
885 | 0 | this_box = new_box(ctx, g, node, BOX_INLINE, &style); |
886 | 0 | append_box(ctx, find_inline_context(ctx, g, root_box), this_box); |
887 | 0 | } |
888 | 0 | else |
889 | 0 | { |
890 | 0 | this_box = root_box; |
891 | 0 | } |
892 | |
|
893 | 0 | flow_box = find_flow_encloser(ctx, this_box); |
894 | 0 | add_flow_break(ctx, g->pool, flow_box, this_box); |
895 | 0 | g->at_bol = 1; |
896 | 0 | } |
897 | | |
898 | | static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) |
899 | 506 | { |
900 | 506 | fz_html_box *this_box; |
901 | 506 | root_box = find_block_context(ctx, root_box); |
902 | 506 | this_box = new_box(ctx, g, node, BOX_BLOCK, style); |
903 | 506 | append_box(ctx, root_box, this_box); |
904 | 506 | return this_box; |
905 | 506 | } |
906 | | |
907 | | static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) |
908 | 0 | { |
909 | 0 | fz_html_box *this_box; |
910 | 0 | root_box = find_block_context(ctx, root_box); |
911 | 0 | this_box = new_box(ctx, g, node, BOX_TABLE, style); |
912 | 0 | append_box(ctx, root_box, this_box); |
913 | 0 | return this_box; |
914 | 0 | } |
915 | | |
916 | | static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) |
917 | 0 | { |
918 | 0 | fz_html_box *this_box, *table_box; |
919 | |
|
920 | 0 | table_box = find_table_row_context(ctx, root_box); |
921 | 0 | if (!table_box) |
922 | 0 | return gen2_block(ctx, g, root_box, node, style); |
923 | | |
924 | 0 | this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style); |
925 | 0 | append_box(ctx, table_box, this_box); |
926 | 0 | return this_box; |
927 | 0 | } |
928 | | |
929 | | static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) |
930 | 0 | { |
931 | 0 | fz_html_box *this_box, *row_box; |
932 | |
|
933 | 0 | row_box = find_table_cell_context(ctx, root_box); |
934 | 0 | if (!row_box) |
935 | 0 | return gen2_block(ctx, g, root_box, node, style); |
936 | | |
937 | 0 | this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style); |
938 | 0 | append_box(ctx, row_box, this_box); |
939 | 0 | return this_box; |
940 | 0 | } |
941 | | |
942 | | static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style) |
943 | 0 | { |
944 | 0 | fz_html_box *img_block_box; |
945 | 0 | fz_html_box *img_inline_box; |
946 | |
|
947 | 0 | if (display == DIS_INLINE || display == DIS_INLINE_BLOCK) |
948 | 0 | { |
949 | 0 | root_box = find_inline_context(ctx, g, root_box); |
950 | 0 | img_inline_box = new_box(ctx, g, node, BOX_INLINE, style); |
951 | 0 | append_box(ctx, root_box, img_inline_box); |
952 | 0 | generate_image(ctx, img_inline_box, img, g); |
953 | 0 | } |
954 | 0 | else |
955 | 0 | { |
956 | 0 | root_box = find_block_context(ctx, root_box); |
957 | 0 | img_block_box = new_box(ctx, g, node, BOX_BLOCK, style); |
958 | 0 | append_box(ctx, root_box, img_block_box); |
959 | |
|
960 | 0 | root_box = find_inline_context(ctx, g, img_block_box); |
961 | 0 | img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style); |
962 | 0 | append_box(ctx, root_box, img_inline_box); |
963 | 0 | generate_image(ctx, img_inline_box, img, g); |
964 | 0 | } |
965 | 0 | } |
966 | | |
967 | | static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) |
968 | 0 | { |
969 | 0 | const char *src = fz_xml_att(node, "src"); |
970 | 0 | if (src) |
971 | 0 | { |
972 | 0 | fz_css_style local_style = *style; |
973 | 0 | fz_image *img; |
974 | 0 | int w, h; |
975 | 0 | const char *w_att = fz_xml_att(node, "width"); |
976 | 0 | const char *h_att = fz_xml_att(node, "height"); |
977 | |
|
978 | 0 | if (w_att && (w = fz_atoi(w_att)) > 0) |
979 | 0 | { |
980 | 0 | local_style.width.value = w; |
981 | 0 | local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; |
982 | 0 | } |
983 | 0 | if (h_att && (h = fz_atoi(h_att)) > 0) |
984 | 0 | { |
985 | 0 | local_style.height.value = h; |
986 | 0 | local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH; |
987 | 0 | } |
988 | |
|
989 | 0 | img = load_html_image(ctx, g->zip, g->base_uri, src); |
990 | 0 | gen2_image_common(ctx, g, root_box, node, img, display, &local_style); |
991 | 0 | } |
992 | 0 | } |
993 | | |
994 | | static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) |
995 | 0 | { |
996 | 0 | const char *src = fz_xml_att(node, "l:href"); |
997 | 0 | if (!src) |
998 | 0 | src = fz_xml_att(node, "xlink:href"); |
999 | 0 | if (src && src[0] == '#') |
1000 | 0 | { |
1001 | 0 | fz_image *img = fz_tree_lookup(ctx, g->images, src+1); |
1002 | 0 | gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style); |
1003 | 0 | } |
1004 | 0 | } |
1005 | | |
1006 | | static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) |
1007 | 0 | { |
1008 | 0 | fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node); |
1009 | 0 | gen2_image_common(ctx, g, root_box, node, img, display, style); |
1010 | 0 | } |
1011 | | |
1012 | | static int get_heading_from_tag(fz_context *ctx, struct genstate *g, const char *tag) |
1013 | 506 | { |
1014 | 506 | if (tag[0] == 'h' && tag[1] != 0 && tag[2] == 0) |
1015 | 1 | { |
1016 | 1 | switch (tag[1]) |
1017 | 1 | { |
1018 | 1 | case '1': return 1; |
1019 | 0 | case '2': return 2; |
1020 | 0 | case '3': return 3; |
1021 | 0 | case '4': return 4; |
1022 | 0 | case '5': return 5; |
1023 | 0 | case '6': return 6; |
1024 | 1 | } |
1025 | 1 | } |
1026 | 505 | if (g->is_fb2) |
1027 | 0 | { |
1028 | 0 | if (!strcmp(tag, "title") || !strcmp(tag, "subtitle")) |
1029 | 0 | return fz_mini(g->section_depth, 6); |
1030 | 0 | } |
1031 | 505 | return 0; |
1032 | 505 | } |
1033 | | |
1034 | | static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, |
1035 | | fz_css_match *match, int display, fz_css_style *style) |
1036 | 616 | { |
1037 | 616 | fz_html_box *this_box; |
1038 | 616 | const char *tag; |
1039 | 616 | const char *lang_att; |
1040 | 616 | const char *dir_att; |
1041 | | |
1042 | 616 | int save_markup_dir = g->markup_dir; |
1043 | 616 | int save_markup_lang = g->markup_lang; |
1044 | 616 | char *save_href = g->href; |
1045 | | |
1046 | 616 | if (display == DIS_NONE) |
1047 | 52 | return; |
1048 | | |
1049 | 564 | tag = fz_xml_tag(node); |
1050 | | |
1051 | 564 | dir_att = fz_xml_att(node, "dir"); |
1052 | 564 | if (dir_att) |
1053 | 55 | { |
1054 | 55 | if (!strcmp(dir_att, "auto")) |
1055 | 0 | g->markup_dir = FZ_BIDI_NEUTRAL; |
1056 | 55 | else if (!strcmp(dir_att, "rtl")) |
1057 | 0 | g->markup_dir = FZ_BIDI_RTL; |
1058 | 55 | else if (!strcmp(dir_att, "ltr")) |
1059 | 55 | g->markup_dir = FZ_BIDI_LTR; |
1060 | 0 | else |
1061 | 0 | g->markup_dir = DEFAULT_DIR; |
1062 | 55 | } |
1063 | | |
1064 | 564 | lang_att = fz_xml_att(node, "lang"); |
1065 | 564 | if (lang_att) |
1066 | 0 | g->markup_lang = fz_text_language_from_string(lang_att); |
1067 | | |
1068 | 564 | switch (display) |
1069 | 564 | { |
1070 | 0 | case DIS_INLINE_BLOCK: |
1071 | | // TODO handle inline block as a flow node |
1072 | 0 | this_box = gen2_block(ctx, g, root_box, node, style); |
1073 | 0 | break; |
1074 | | |
1075 | 506 | case DIS_BLOCK: |
1076 | 506 | this_box = gen2_block(ctx, g, root_box, node, style); |
1077 | 506 | this_box->heading = get_heading_from_tag(ctx, g, tag); |
1078 | 506 | break; |
1079 | | |
1080 | 0 | case DIS_LIST_ITEM: |
1081 | 0 | this_box = gen2_block(ctx, g, root_box, node, style); |
1082 | 0 | this_box->list_item = ++g->list_counter; |
1083 | 0 | break; |
1084 | | |
1085 | | // TODO: https://www.w3.org/TR/CSS2/tables.html#anonymous-boxes |
1086 | | // |
1087 | | // The table generation code should insert and create anonymous boxes |
1088 | | // for any missing child/parent elements. |
1089 | | // |
1090 | | // MISSING CHILDREN: |
1091 | | // 1: Wrap consecutive BLOCK found in a TABLE in an anon TABLE_ROW. |
1092 | | // 2: Wrap consecutive BLOCK found in a TABLE_ROW in an anon TABLE_CELL. |
1093 | | // |
1094 | | // MISSING PARENTS: |
1095 | | // 1: Wrap consecutive TABLE_CELL found outside TABLE_ROW in an anon TABLE_ROW |
1096 | | // 2: Wrap consecutive TABLE_ROW found outside TABLE in an anon TABLE |
1097 | | // |
1098 | | // For now we ignore this and treat any such elements that are out of |
1099 | | // context as plain block elements. |
1100 | | |
1101 | 0 | case DIS_TABLE: |
1102 | 0 | this_box = gen2_table(ctx, g, root_box, node, style); |
1103 | 0 | break; |
1104 | 0 | case DIS_TABLE_GROUP: |
1105 | | // no box for table-row-group elements |
1106 | 0 | this_box = root_box; |
1107 | 0 | break; |
1108 | 0 | case DIS_TABLE_ROW: |
1109 | 0 | this_box = gen2_table_row(ctx, g, root_box, node, style); |
1110 | 0 | break; |
1111 | 0 | case DIS_TABLE_CELL: |
1112 | 0 | this_box = gen2_table_cell(ctx, g, root_box, node, style); |
1113 | 0 | break; |
1114 | | |
1115 | 58 | case DIS_INLINE: |
1116 | 58 | default: |
1117 | 58 | this_box = gen2_inline(ctx, g, root_box, node, style); |
1118 | 58 | break; |
1119 | 564 | } |
1120 | | |
1121 | 564 | if (tag && !strcmp(tag, "ol")) |
1122 | 0 | { |
1123 | 0 | int save_list_counter = g->list_counter; |
1124 | 0 | g->list_counter = 0; |
1125 | 0 | gen2_children(ctx, g, this_box, node, match); |
1126 | 0 | g->list_counter = save_list_counter; |
1127 | 0 | } |
1128 | 564 | else if (tag && !strcmp(tag, "section")) |
1129 | 0 | { |
1130 | 0 | int save_section_depth = g->section_depth; |
1131 | 0 | g->section_depth++; |
1132 | 0 | gen2_children(ctx, g, this_box, node, match); |
1133 | 0 | g->section_depth = save_section_depth; |
1134 | 0 | } |
1135 | 564 | else |
1136 | 564 | { |
1137 | 564 | gen2_children(ctx, g, this_box, node, match); |
1138 | 564 | } |
1139 | | |
1140 | 564 | g->markup_dir = save_markup_dir; |
1141 | 564 | g->markup_lang = save_markup_lang; |
1142 | 564 | g->href = save_href; |
1143 | 564 | } |
1144 | | |
1145 | | static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match) |
1146 | 564 | { |
1147 | 564 | fz_xml *node; |
1148 | 564 | const char *tag; |
1149 | 564 | fz_css_match match; |
1150 | 564 | fz_css_style style; |
1151 | 564 | int display; |
1152 | | |
1153 | 1.75k | for (node = fz_xml_down(root_node); node; node = fz_xml_next(node)) |
1154 | 1.19k | { |
1155 | 1.19k | tag = fz_xml_tag(node); |
1156 | 1.19k | if (tag) |
1157 | 563 | { |
1158 | 563 | fz_match_css(ctx, &match, root_match, g->css, node); |
1159 | 563 | fz_apply_css_style(ctx, g->set, &style, &match); |
1160 | 563 | display = fz_get_css_match_display(&match); |
1161 | 563 | if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) |
1162 | 0 | { |
1163 | 0 | gen2_break(ctx, g, root_box, node); |
1164 | 0 | } |
1165 | 563 | else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) |
1166 | 0 | { |
1167 | 0 | gen2_image_html(ctx, g, root_box, node, display, &style); |
1168 | 0 | } |
1169 | 563 | else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) |
1170 | 0 | { |
1171 | 0 | gen2_image_fb2(ctx, g, root_box, node, display, &style); |
1172 | 0 | } |
1173 | 563 | else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) |
1174 | 0 | { |
1175 | 0 | gen2_image_svg(ctx, g, root_box, node, display, &style); |
1176 | 0 | } |
1177 | 563 | else |
1178 | 563 | { |
1179 | 563 | gen2_tag(ctx, g, root_box, node, &match, display, &style); |
1180 | 563 | } |
1181 | 563 | } |
1182 | 628 | else |
1183 | 628 | { |
1184 | 628 | gen2_text(ctx, g, root_box, node); |
1185 | 628 | } |
1186 | 1.19k | } |
1187 | 564 | } |
1188 | | |
1189 | | static char *concat_text(fz_context *ctx, fz_xml *root) |
1190 | 0 | { |
1191 | 0 | fz_xml *node; |
1192 | 0 | size_t i = 0, n = 1; |
1193 | 0 | char *s; |
1194 | 0 | for (node = fz_xml_down(root); node; node = fz_xml_next(node)) |
1195 | 0 | { |
1196 | 0 | const char *text = fz_xml_text(node); |
1197 | 0 | n += text ? strlen(text) : 0; |
1198 | 0 | } |
1199 | 0 | s = Memento_label(fz_malloc(ctx, n), "concat_html"); |
1200 | 0 | for (node = fz_xml_down(root); node; node = fz_xml_next(node)) |
1201 | 0 | { |
1202 | 0 | const char *text = fz_xml_text(node); |
1203 | 0 | if (text) |
1204 | 0 | { |
1205 | 0 | n = strlen(text); |
1206 | 0 | memcpy(s+i, text, n); |
1207 | 0 | i += n; |
1208 | 0 | } |
1209 | 0 | } |
1210 | 0 | s[i] = 0; |
1211 | 0 | return s; |
1212 | 0 | } |
1213 | | |
1214 | | static void |
1215 | | html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href) |
1216 | 0 | { |
1217 | 0 | char path[2048]; |
1218 | 0 | char css_base_uri[2048]; |
1219 | 0 | fz_buffer *buf; |
1220 | |
|
1221 | 0 | fz_var(buf); |
1222 | |
|
1223 | 0 | fz_strlcpy(path, base_uri, sizeof path); |
1224 | 0 | fz_strlcat(path, "/", sizeof path); |
1225 | 0 | fz_strlcat(path, href, sizeof path); |
1226 | 0 | fz_urldecode(path); |
1227 | 0 | fz_cleanname(path); |
1228 | |
|
1229 | 0 | fz_dirname(css_base_uri, path, sizeof css_base_uri); |
1230 | |
|
1231 | 0 | buf = NULL; |
1232 | 0 | fz_try(ctx) |
1233 | 0 | { |
1234 | 0 | buf = fz_read_archive_entry(ctx, zip, path); |
1235 | 0 | fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); |
1236 | 0 | fz_add_css_font_faces(ctx, set, zip, css_base_uri, css); |
1237 | 0 | } |
1238 | 0 | fz_always(ctx) |
1239 | 0 | fz_drop_buffer(ctx, buf); |
1240 | 0 | fz_catch(ctx) |
1241 | 0 | { |
1242 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
1243 | 0 | fz_report_error(ctx); |
1244 | 0 | fz_warn(ctx, "ignoring stylesheet %s", path); |
1245 | 0 | } |
1246 | 0 | } |
1247 | | |
1248 | | static void |
1249 | | html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) |
1250 | 53 | { |
1251 | 53 | fz_xml *html, *head, *node; |
1252 | | |
1253 | 53 | html = fz_xml_find(root, "html"); |
1254 | 53 | head = fz_xml_find_down(html, "head"); |
1255 | 65 | for (node = fz_xml_down(head); node; node = fz_xml_next(node)) |
1256 | 12 | { |
1257 | 12 | if (fz_xml_is_tag(node, "link")) |
1258 | 0 | { |
1259 | 0 | char *rel = fz_xml_att(node, "rel"); |
1260 | 0 | if (rel && !fz_strcasecmp(rel, "stylesheet")) |
1261 | 0 | { |
1262 | 0 | char *type = fz_xml_att(node, "type"); |
1263 | 0 | if ((type && !strcmp(type, "text/css")) || !type) |
1264 | 0 | { |
1265 | 0 | char *href = fz_xml_att(node, "href"); |
1266 | 0 | if (href) |
1267 | 0 | { |
1268 | 0 | html_load_css_link(ctx, set, zip, base_uri, css, root, href); |
1269 | 0 | } |
1270 | 0 | } |
1271 | 0 | } |
1272 | 0 | } |
1273 | 12 | else if (fz_xml_is_tag(node, "style")) |
1274 | 0 | { |
1275 | 0 | char *s = concat_text(ctx, node); |
1276 | 0 | fz_try(ctx) |
1277 | 0 | { |
1278 | 0 | fz_parse_css(ctx, css, s, "<style>"); |
1279 | 0 | fz_add_css_font_faces(ctx, set, zip, base_uri, css); |
1280 | 0 | } |
1281 | 0 | fz_always(ctx) |
1282 | 0 | fz_free(ctx, s); |
1283 | 0 | fz_catch(ctx) |
1284 | 0 | { |
1285 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
1286 | 0 | fz_report_error(ctx); |
1287 | 0 | fz_warn(ctx, "ignoring inline stylesheet"); |
1288 | 0 | } |
1289 | 0 | } |
1290 | 12 | } |
1291 | 53 | } |
1292 | | |
1293 | | static void |
1294 | | fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) |
1295 | 0 | { |
1296 | 0 | fz_xml *fictionbook, *stylesheet; |
1297 | |
|
1298 | 0 | fictionbook = fz_xml_find(root, "FictionBook"); |
1299 | 0 | stylesheet = fz_xml_find_down(fictionbook, "stylesheet"); |
1300 | 0 | if (stylesheet) |
1301 | 0 | { |
1302 | 0 | char *s = concat_text(ctx, stylesheet); |
1303 | 0 | fz_try(ctx) |
1304 | 0 | { |
1305 | 0 | fz_parse_css(ctx, css, s, "<stylesheet>"); |
1306 | 0 | fz_add_css_font_faces(ctx, set, zip, base_uri, css); |
1307 | 0 | } |
1308 | 0 | fz_catch(ctx) |
1309 | 0 | { |
1310 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
1311 | 0 | fz_report_error(ctx); |
1312 | 0 | fz_warn(ctx, "ignoring inline stylesheet"); |
1313 | 0 | } |
1314 | 0 | fz_free(ctx, s); |
1315 | 0 | } |
1316 | 0 | } |
1317 | | |
1318 | | static fz_tree * |
1319 | | load_fb2_images(fz_context *ctx, fz_xml *root) |
1320 | 0 | { |
1321 | 0 | fz_xml *fictionbook, *binary; |
1322 | 0 | fz_tree *images = NULL; |
1323 | |
|
1324 | 0 | fictionbook = fz_xml_find(root, "FictionBook"); |
1325 | 0 | for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary")) |
1326 | 0 | { |
1327 | 0 | const char *id = fz_xml_att(binary, "id"); |
1328 | 0 | char *b64 = NULL; |
1329 | 0 | fz_buffer *buf = NULL; |
1330 | 0 | fz_image *img = NULL; |
1331 | |
|
1332 | 0 | fz_var(b64); |
1333 | 0 | fz_var(buf); |
1334 | |
|
1335 | 0 | if (id == NULL) |
1336 | 0 | { |
1337 | 0 | fz_warn(ctx, "Skipping image with no id"); |
1338 | 0 | continue; |
1339 | 0 | } |
1340 | | |
1341 | 0 | fz_try(ctx) |
1342 | 0 | { |
1343 | 0 | b64 = concat_text(ctx, binary); |
1344 | 0 | buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64)); |
1345 | 0 | img = fz_new_image_from_buffer(ctx, buf); |
1346 | 0 | } |
1347 | 0 | fz_always(ctx) |
1348 | 0 | { |
1349 | 0 | fz_drop_buffer(ctx, buf); |
1350 | 0 | fz_free(ctx, b64); |
1351 | 0 | } |
1352 | 0 | fz_catch(ctx) |
1353 | 0 | fz_rethrow(ctx); |
1354 | | |
1355 | 0 | images = fz_tree_insert(ctx, images, id, img); |
1356 | 0 | } |
1357 | | |
1358 | 0 | return images; |
1359 | 0 | } |
1360 | | |
/*
	Growable scratch buffer of Unicode code points, reused between
	bidi detection runs.
*/
typedef struct
{
	uint32_t *data; /* storage, (re)allocated on demand */
	size_t cap; /* number of elements 'data' has room for */
	size_t len; /* number of elements currently in use */
} uni_buf;
1367 | | |
1368 | | typedef struct |
1369 | | { |
1370 | | fz_context *ctx; |
1371 | | fz_pool *pool; |
1372 | | fz_html_flow *flow; |
1373 | | uni_buf *buffer; |
1374 | | } bidi_data; |
1375 | | |
1376 | | static void fragment_cb(const uint32_t *fragment, |
1377 | | size_t fragment_len, |
1378 | | int bidi_level, |
1379 | | int script, |
1380 | | void *arg) |
1381 | 244 | { |
1382 | 244 | bidi_data *data = (bidi_data *)arg; |
1383 | | |
1384 | | /* We are guaranteed that fragmentOffset will be at the beginning |
1385 | | * of flow. */ |
1386 | 4.60k | while (fragment_len > 0) |
1387 | 4.36k | { |
1388 | 4.36k | size_t len; |
1389 | | |
1390 | 4.36k | if (data->flow->type == FLOW_SPACE) |
1391 | 1.57k | { |
1392 | 1.57k | len = 1; |
1393 | 1.57k | } |
1394 | 2.78k | else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK || |
1395 | 2.78k | data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR) |
1396 | 484 | { |
1397 | 484 | len = 0; |
1398 | 484 | } |
1399 | 2.30k | else |
1400 | 2.30k | { |
1401 | | /* Must be text */ |
1402 | 2.30k | len = fz_utflen(data->flow->content.text); |
1403 | 2.30k | if (len > fragment_len) |
1404 | 0 | { |
1405 | | /* We need to split this flow box */ |
1406 | 0 | (void)fz_html_split_flow(data->ctx, data->pool, data->flow, fragment_len); |
1407 | 0 | len = fz_utflen(data->flow->content.text); |
1408 | 0 | } |
1409 | 2.30k | } |
1410 | | |
1411 | | /* This flow box is entirely contained within this fragment. */ |
1412 | 4.36k | data->flow->bidi_level = bidi_level; |
1413 | 4.36k | data->flow->script = script; |
1414 | 4.36k | data->flow = data->flow->next; |
1415 | 4.36k | fragment_len -= len; |
1416 | 4.36k | } |
1417 | 244 | } |
1418 | | |
1419 | | static fz_bidi_direction |
1420 | | detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow) |
1421 | 244 | { |
1422 | 244 | fz_html_flow *end = flow; |
1423 | 244 | bidi_data data; |
1424 | | |
1425 | 488 | while (end) |
1426 | 244 | { |
1427 | 244 | int level = end->bidi_level; |
1428 | | |
1429 | | /* Gather the text from the flow up into a single buffer (at |
1430 | | * least, as much of it as has the same direction markup). */ |
1431 | 244 | buffer->len = 0; |
1432 | 4.60k | while (end && (level & 1) == (end->bidi_level & 1)) |
1433 | 4.36k | { |
1434 | 4.36k | size_t len = 0; |
1435 | 4.36k | const char *text = ""; |
1436 | 4.36k | int broken = 0; |
1437 | | |
1438 | 4.36k | switch (end->type) |
1439 | 4.36k | { |
1440 | 2.30k | case FLOW_WORD: |
1441 | 2.30k | len = fz_utflen(end->content.text); |
1442 | 2.30k | text = end->content.text; |
1443 | 2.30k | break; |
1444 | 1.57k | case FLOW_SPACE: |
1445 | 1.57k | len = 1; |
1446 | 1.57k | text = " "; |
1447 | 1.57k | break; |
1448 | 0 | case FLOW_SHYPHEN: |
1449 | 484 | case FLOW_SBREAK: |
1450 | 484 | break; |
1451 | 0 | case FLOW_BREAK: |
1452 | 0 | case FLOW_IMAGE: |
1453 | 0 | broken = 1; |
1454 | 0 | break; |
1455 | 4.36k | } |
1456 | | |
1457 | 4.36k | end = end->next; |
1458 | | |
1459 | 4.36k | if (broken) |
1460 | 0 | break; |
1461 | | |
1462 | | /* Make sure the buffer is large enough */ |
1463 | 4.36k | if (buffer->len + len > buffer->cap) |
1464 | 60 | { |
1465 | 60 | size_t newcap = buffer->cap; |
1466 | 60 | if (newcap < 128) |
1467 | 47 | newcap = 128; /* Sensible small default */ |
1468 | | |
1469 | 73 | while (newcap < buffer->len + len) |
1470 | 13 | newcap = (newcap * 3) / 2; |
1471 | | |
1472 | 60 | buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t); |
1473 | 60 | buffer->cap = newcap; |
1474 | 60 | } |
1475 | | |
1476 | | /* Expand the utf8 text into Unicode and store it in the buffer */ |
1477 | 19.5k | while (*text) |
1478 | 15.2k | { |
1479 | 15.2k | int rune; |
1480 | 15.2k | text += fz_chartorune(&rune, text); |
1481 | 15.2k | buffer->data[buffer->len++] = rune; |
1482 | 15.2k | } |
1483 | 4.36k | } |
1484 | | |
1485 | | /* Detect directionality for the buffer */ |
1486 | 244 | data.ctx = ctx; |
1487 | 244 | data.pool = pool; |
1488 | 244 | data.flow = flow; |
1489 | 244 | data.buffer = buffer; |
1490 | 244 | fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */); |
1491 | 244 | flow = end; |
1492 | 244 | } |
1493 | 244 | return bidi_dir; |
1494 | 244 | } |
1495 | | |
1496 | | static void |
1497 | | detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box) |
1498 | 1.10k | { |
1499 | 2.15k | while (box) |
1500 | 1.04k | { |
1501 | 1.04k | if (box->type == BOX_FLOW) |
1502 | 244 | box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->u.flow.head); |
1503 | 1.04k | detect_box_directionality(ctx, pool, buffer, box->down); |
1504 | 1.04k | box = box->next; |
1505 | 1.04k | } |
1506 | 1.10k | } |
1507 | | |
1508 | | static void |
1509 | | detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box) |
1510 | 53 | { |
1511 | 53 | uni_buf buffer = { NULL }; |
1512 | | |
1513 | 106 | fz_try(ctx) |
1514 | 106 | detect_box_directionality(ctx, pool, &buffer, box); |
1515 | 106 | fz_always(ctx) |
1516 | 53 | fz_free(ctx, buffer.data); |
1517 | 53 | fz_catch(ctx) |
1518 | 0 | fz_rethrow(ctx); |
1519 | 53 | } |
1520 | | |
1521 | | static fz_xml_doc * |
1522 | | parse_to_xml(fz_context *ctx, fz_buffer *buf, int try_xml, int try_html5) |
1523 | 53 | { |
1524 | 53 | fz_xml_doc *xml; |
1525 | | |
1526 | 53 | if (try_xml && try_html5) |
1527 | 0 | { |
1528 | 0 | fz_try(ctx) |
1529 | 0 | xml = fz_parse_xml(ctx, buf, 1); |
1530 | 0 | fz_catch(ctx) |
1531 | 0 | { |
1532 | 0 | if (fz_caught(ctx) == FZ_ERROR_SYNTAX) |
1533 | 0 | { |
1534 | 0 | fz_report_error(ctx); |
1535 | 0 | fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser"); |
1536 | 0 | xml = fz_parse_xml_from_html5(ctx, buf); |
1537 | 0 | } |
1538 | 0 | else |
1539 | 0 | fz_rethrow(ctx); |
1540 | 0 | } |
1541 | 0 | } |
1542 | 53 | else if (try_xml) |
1543 | 0 | xml = fz_parse_xml(ctx, buf, 1); |
1544 | 53 | else |
1545 | 53 | { |
1546 | 53 | assert(try_html5); |
1547 | 53 | xml = fz_parse_xml_from_html5(ctx, buf); |
1548 | 53 | } |
1549 | | |
1550 | 53 | return xml; |
1551 | 53 | } |
1552 | | |
1553 | | static void move_background_color_style_up(fz_context *ctx, struct genstate *g, fz_html_box *root, fz_html_box *from) |
1554 | 0 | { |
1555 | 0 | fz_css_color transparent = { 0, 0, 0, 0 }; |
1556 | 0 | fz_css_style s1, s2; |
1557 | 0 | memcpy(&s1, root->style, sizeof s1); |
1558 | 0 | memcpy(&s2, from->style, sizeof s2); |
1559 | 0 | s1.background_color = s2.background_color; |
1560 | 0 | s2.background_color = transparent; |
1561 | 0 | root->style = fz_css_enlist(ctx, &s1, &g->styles, g->pool); |
1562 | 0 | from->style = fz_css_enlist(ctx, &s2, &g->styles, g->pool); |
1563 | 0 | } |
1564 | | |
1565 | | static void move_background_color_up(fz_context *ctx, struct genstate *g, fz_html_box *root) |
1566 | 53 | { |
1567 | 53 | fz_html_box *html, *body; |
1568 | | |
1569 | 53 | if (root->style->background_color.a != 0) |
1570 | 0 | { |
1571 | 0 | return; |
1572 | 0 | } |
1573 | | |
1574 | 53 | html = root->down; |
1575 | 53 | if (html && !strcmp(html->tag, "html")) |
1576 | 52 | { |
1577 | 52 | if (html->style->background_color.a != 0) |
1578 | 0 | { |
1579 | 0 | move_background_color_style_up(ctx, g, root, html); |
1580 | 0 | return; |
1581 | 0 | } |
1582 | | |
1583 | 52 | body = html->down; |
1584 | 52 | if (body && !strcmp(body->tag, "body")) |
1585 | 52 | { |
1586 | 52 | if (body->style->background_color.a != 0) |
1587 | 0 | { |
1588 | 0 | move_background_color_style_up(ctx, g, root, body); |
1589 | 0 | return; |
1590 | 0 | } |
1591 | 52 | } |
1592 | 52 | } |
1593 | 53 | } |
1594 | | |
1595 | | static void |
1596 | | xml_to_boxes(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, const char *user_css, |
1597 | | fz_xml_doc *xml, fz_html_tree *tree, char **rtitle, int try_fictionbook, int is_mobi) |
1598 | 53 | { |
1599 | 53 | fz_xml *root, *node; |
1600 | 53 | char *title; |
1601 | | |
1602 | 53 | fz_css_match root_match, match; |
1603 | 53 | struct genstate g = {0}; |
1604 | | |
1605 | 53 | g.pool = NULL; |
1606 | 53 | g.set = set; |
1607 | 53 | g.zip = zip; |
1608 | 53 | g.images = NULL; |
1609 | 53 | g.xml = xml; |
1610 | 53 | g.is_fb2 = 0; |
1611 | 53 | g.base_uri = base_uri; |
1612 | 53 | g.css = NULL; |
1613 | 53 | g.at_bol = 0; |
1614 | 53 | g.emit_white = 0; |
1615 | 53 | g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP; |
1616 | 53 | g.list_counter = 0; |
1617 | 53 | g.section_depth = 0; |
1618 | 53 | g.markup_dir = FZ_BIDI_LTR; |
1619 | 53 | g.markup_lang = FZ_LANG_UNSET; |
1620 | 53 | g.href = NULL; |
1621 | 53 | g.styles = NULL; |
1622 | | |
1623 | 53 | if (rtitle) |
1624 | 14 | *rtitle = NULL; |
1625 | | |
1626 | 53 | root = fz_xml_root(g.xml); |
1627 | 53 | g.css = fz_new_css(ctx); |
1628 | | |
1629 | 53 | #ifndef NDEBUG |
1630 | 53 | if (fz_atoi(getenv("FZ_DEBUG_XML"))) |
1631 | 0 | fz_debug_xml(root, 0); |
1632 | 53 | #endif |
1633 | | |
1634 | 106 | fz_try(ctx) |
1635 | 106 | { |
1636 | 53 | if (try_fictionbook && fz_xml_find(root, "FictionBook")) |
1637 | 0 | { |
1638 | 0 | g.is_fb2 = 1; |
1639 | 0 | fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>"); |
1640 | 0 | if (fz_use_document_css(ctx)) |
1641 | 0 | fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); |
1642 | 0 | g.images = load_fb2_images(ctx, root); |
1643 | 0 | } |
1644 | 53 | else if (is_mobi) |
1645 | 0 | { |
1646 | 0 | g.is_fb2 = 0; |
1647 | 0 | fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); |
1648 | 0 | fz_parse_css(ctx, g.css, mobi_default_css, "<default:mobi>"); |
1649 | 0 | if (fz_use_document_css(ctx)) |
1650 | 0 | html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); |
1651 | 0 | } |
1652 | 53 | else |
1653 | 53 | { |
1654 | 53 | g.is_fb2 = 0; |
1655 | 53 | fz_parse_css(ctx, g.css, html_default_css, "<default:html>"); |
1656 | 53 | if (fz_use_document_css(ctx)) |
1657 | 53 | html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root); |
1658 | 53 | } |
1659 | | |
1660 | 53 | if (user_css) |
1661 | 39 | { |
1662 | 39 | fz_parse_css(ctx, g.css, user_css, "<user>"); |
1663 | 39 | fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css); |
1664 | 39 | } |
1665 | 53 | } |
1666 | 106 | fz_catch(ctx) |
1667 | 1 | { |
1668 | 1 | fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); |
1669 | 1 | fz_drop_css(ctx, g.css); |
1670 | 1 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
1671 | 1 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
1672 | 1 | fz_report_error(ctx); |
1673 | 1 | fz_warn(ctx, "ignoring styles"); |
1674 | 1 | g.css = fz_new_css(ctx); |
1675 | 1 | g.images = NULL; |
1676 | 1 | } |
1677 | | |
1678 | 53 | #ifndef NDEBUG |
1679 | 53 | if (fz_atoi(getenv("FZ_DEBUG_CSS"))) |
1680 | 0 | fz_debug_css(ctx, g.css); |
1681 | 53 | #endif |
1682 | | |
1683 | 106 | fz_try(ctx) |
1684 | 106 | { |
1685 | 53 | fz_css_style style; |
1686 | 53 | int display; |
1687 | | |
1688 | 53 | fz_match_css_at_page(ctx, &root_match, g.css); |
1689 | 53 | fz_apply_css_style(ctx, g.set, &style, &root_match); |
1690 | | |
1691 | 53 | g.pool = tree->pool; |
1692 | 53 | g.markup_dir = DEFAULT_DIR; |
1693 | 53 | g.markup_lang = FZ_LANG_UNSET; |
1694 | | |
1695 | | // Create root node |
1696 | 53 | tree->root = new_box(ctx, &g, NULL, BOX_BLOCK, &style); |
1697 | | // TODO: transfer page margins out of this hacky box |
1698 | | |
1699 | 53 | tree->root->tag = ":root"; |
1700 | 53 | tree->root->s.layout.em = 0; |
1701 | 53 | tree->root->s.layout.x = 0; |
1702 | 53 | tree->root->s.layout.y = 0; |
1703 | 53 | tree->root->s.layout.w = 0; |
1704 | 53 | tree->root->s.layout.b = 0; |
1705 | | |
1706 | | // Create document node (html). |
1707 | 53 | fz_match_css(ctx, &match, &root_match, g.css, root); |
1708 | 53 | fz_apply_css_style(ctx, g.set, &style, &match); |
1709 | 53 | display = fz_get_css_match_display(&match); |
1710 | 53 | gen2_tag(ctx, &g, tree->root, root, &match, display, &style); |
1711 | | |
1712 | 53 | detect_directionality(ctx, g.pool, tree->root); |
1713 | | |
1714 | 53 | if (g.is_fb2) |
1715 | 0 | { |
1716 | 0 | node = fz_xml_find(root, "FictionBook"); |
1717 | 0 | node = fz_xml_find_down(node, "description"); |
1718 | 0 | node = fz_xml_find_down(node, "title-info"); |
1719 | 0 | node = fz_xml_find_down(node, "book-title"); |
1720 | 0 | if (rtitle) |
1721 | 0 | { |
1722 | 0 | title = fz_xml_text(fz_xml_down(node)); |
1723 | 0 | if (title) |
1724 | 0 | *rtitle = fz_pool_strdup(ctx, g.pool, title); |
1725 | 0 | } |
1726 | 0 | } |
1727 | 53 | else |
1728 | 53 | { |
1729 | 53 | node = fz_xml_find(root, "html"); |
1730 | 53 | node = fz_xml_find_down(node, "head"); |
1731 | 53 | node = fz_xml_find_down(node, "title"); |
1732 | 53 | if (rtitle) |
1733 | 14 | { |
1734 | 14 | title = fz_xml_text(fz_xml_down(node)); |
1735 | 14 | if (title) |
1736 | 3 | *rtitle = fz_pool_strdup(ctx, g.pool, title); |
1737 | 14 | } |
1738 | | |
1739 | | // Move html or body background-color to :root. |
1740 | 53 | move_background_color_up(ctx, &g, tree->root); |
1741 | 53 | } |
1742 | 53 | } |
1743 | 106 | fz_always(ctx) |
1744 | 53 | { |
1745 | 53 | fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); |
1746 | 53 | fz_drop_css(ctx, g.css); |
1747 | 53 | } |
1748 | 53 | fz_catch(ctx) |
1749 | 0 | { |
1750 | 0 | if (rtitle) |
1751 | 0 | { |
1752 | 0 | fz_free(ctx, *rtitle); |
1753 | 0 | *rtitle = NULL; |
1754 | 0 | } |
1755 | | /* Dropping the tree works regardless of whether the tree is part of an fz_html or not. */ |
1756 | 0 | fz_drop_html_tree(ctx, tree); |
1757 | 0 | fz_rethrow(ctx); |
1758 | 0 | } |
1759 | 53 | } |
1760 | | |
/*
 * CSS font-size values used when converting the MOBI <font size="..."> attribute.
 * patch_mobi_html maps the absolute sizes "1".."7" to entries 0..6, and the
 * relative sizes ("+N"/"-N") to entries of this same table.
 */
static const char *mobi_font_size[7] = {
	"0.67em",
	"0.83em",
	"1em",
	"1.17em",
	"1.33em",
	"1.5em",
	"1.67em",
};
1770 | | |
1771 | | static void |
1772 | | patch_mobi_html(fz_context *ctx, fz_pool *pool, fz_xml *node) |
1773 | 0 | { |
1774 | 0 | fz_xml *down; |
1775 | 0 | char buf[500]; |
1776 | 0 | while (node) |
1777 | 0 | { |
1778 | 0 | char *tag = fz_xml_tag(node); |
1779 | 0 | if (tag) |
1780 | 0 | { |
1781 | | // Read MOBI attributes, convert to inline CSS style |
1782 | 0 | if (!strcmp(tag, "font")) |
1783 | 0 | { |
1784 | 0 | const char *size = fz_xml_att(node, "size"); |
1785 | 0 | if (size) |
1786 | 0 | { |
1787 | 0 | if (!strcmp(size, "1")) size = mobi_font_size[0]; |
1788 | 0 | else if (!strcmp(size, "2")) size = mobi_font_size[1]; |
1789 | 0 | else if (!strcmp(size, "3")) size = mobi_font_size[2]; |
1790 | 0 | else if (!strcmp(size, "4")) size = mobi_font_size[3]; |
1791 | 0 | else if (!strcmp(size, "5")) size = mobi_font_size[4]; |
1792 | 0 | else if (!strcmp(size, "6")) size = mobi_font_size[5]; |
1793 | 0 | else if (!strcmp(size, "7")) size = mobi_font_size[6]; |
1794 | 0 | else if (!strcmp(size, "+1")) size = mobi_font_size[3]; |
1795 | 0 | else if (!strcmp(size, "+2")) size = mobi_font_size[4]; |
1796 | 0 | else if (!strcmp(size, "+3")) size = mobi_font_size[5]; |
1797 | 0 | else if (!strcmp(size, "+4")) size = mobi_font_size[6]; |
1798 | 0 | else if (!strcmp(size, "+5")) size = mobi_font_size[6]; |
1799 | 0 | else if (!strcmp(size, "+6")) size = mobi_font_size[6]; |
1800 | 0 | else if (!strcmp(size, "-1")) size = mobi_font_size[1]; |
1801 | 0 | else if (!strcmp(size, "-2")) size = mobi_font_size[0]; |
1802 | 0 | else if (!strcmp(size, "-3")) size = mobi_font_size[0]; |
1803 | 0 | else if (!strcmp(size, "-4")) size = mobi_font_size[0]; |
1804 | 0 | else if (!strcmp(size, "-5")) size = mobi_font_size[0]; |
1805 | 0 | else if (!strcmp(size, "-6")) size = mobi_font_size[0]; |
1806 | 0 | fz_snprintf(buf, sizeof buf, "font-size:%s", size); |
1807 | 0 | fz_xml_add_att(ctx, pool, node, "style", buf); |
1808 | 0 | } |
1809 | 0 | } |
1810 | 0 | else |
1811 | 0 | { |
1812 | 0 | char *height = fz_xml_att(node, "height"); |
1813 | 0 | char *width = fz_xml_att(node, "width"); |
1814 | 0 | char *align = fz_xml_att(node, "align"); |
1815 | 0 | if (height || width || align) |
1816 | 0 | { |
1817 | 0 | buf[0] = 0; |
1818 | 0 | if (height) |
1819 | 0 | { |
1820 | 0 | fz_strlcat(buf, "margin-top:", sizeof buf); |
1821 | 0 | fz_strlcat(buf, height, sizeof buf); |
1822 | 0 | fz_strlcat(buf, ";", sizeof buf); |
1823 | 0 | } |
1824 | 0 | if (width) |
1825 | 0 | { |
1826 | 0 | fz_strlcat(buf, "text-indent:", sizeof buf); |
1827 | 0 | fz_strlcat(buf, width, sizeof buf); |
1828 | 0 | fz_strlcat(buf, ";", sizeof buf); |
1829 | 0 | } |
1830 | 0 | if (align) |
1831 | 0 | { |
1832 | 0 | fz_strlcat(buf, "text-align:", sizeof buf); |
1833 | 0 | fz_strlcat(buf, align, sizeof buf); |
1834 | 0 | fz_strlcat(buf, ";", sizeof buf); |
1835 | 0 | } |
1836 | 0 | fz_xml_add_att(ctx, pool, node, "style", buf); |
1837 | 0 | } |
1838 | 0 | if (!strcmp(tag, "img")) |
1839 | 0 | { |
1840 | 0 | char *recindex = fz_xml_att(node, "recindex"); |
1841 | 0 | if (recindex) |
1842 | 0 | fz_xml_add_att(ctx, pool, node, "src", recindex); |
1843 | 0 | } |
1844 | 0 | } |
1845 | 0 | } |
1846 | |
|
1847 | 0 | down = fz_xml_down(node); |
1848 | 0 | if (down) |
1849 | 0 | patch_mobi_html(ctx, pool, down); |
1850 | |
|
1851 | 0 | node = fz_xml_next(node); |
1852 | 0 | } |
1853 | 0 | } |
1854 | | |
1855 | | static void |
1856 | | fz_parse_html_tree(fz_context *ctx, |
1857 | | fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, |
1858 | | int try_xml, int try_html5, fz_html_tree *tree, char **rtitle, int try_fictionbook, int patch_mobi) |
1859 | 14 | { |
1860 | 14 | fz_xml_doc *xml; |
1861 | | |
1862 | 14 | if (rtitle) |
1863 | 14 | *rtitle = NULL; |
1864 | | |
1865 | 14 | xml = parse_to_xml(ctx, buf, try_xml, try_html5); |
1866 | | |
1867 | 14 | if (patch_mobi) |
1868 | 0 | patch_mobi_html(ctx, xml->u.doc.pool, xml); |
1869 | | |
1870 | 28 | fz_try(ctx) |
1871 | 28 | xml_to_boxes(ctx, set, zip, base_uri, user_css, xml, tree, rtitle, try_fictionbook, patch_mobi); |
1872 | 28 | fz_always(ctx) |
1873 | 14 | fz_drop_xml(ctx, xml); |
1874 | 14 | fz_catch(ctx) |
1875 | 0 | fz_rethrow(ctx); |
1876 | 14 | } |
1877 | | |
/*
 * Allocate a new tree-derived object of the given TYPE: calls
 * fz_new_html_tree_of_size with sizeof(TYPE) and the DROP function, labels
 * the allocation with the type name via Memento_label, and casts the result
 * to TYPE *.
 */
#define fz_new_derived_html_tree(CTX, TYPE, DROP) \
	((TYPE *)Memento_label(fz_new_html_tree_of_size(CTX, sizeof(TYPE), DROP), #TYPE))
1880 | | |
1881 | | static fz_html_tree * |
1882 | | fz_new_html_tree_of_size(fz_context *ctx, size_t size, fz_store_drop_fn *drop) |
1883 | 53 | { |
1884 | 53 | fz_pool *pool = fz_new_pool(ctx); |
1885 | 53 | fz_html_tree *tree; |
1886 | | |
1887 | 106 | fz_try(ctx) |
1888 | 106 | { |
1889 | 53 | tree = fz_pool_alloc(ctx, pool, size); |
1890 | 53 | FZ_INIT_STORABLE(tree, 1, drop); |
1891 | 53 | tree->pool = pool; |
1892 | 53 | } |
1893 | 106 | fz_catch(ctx) |
1894 | 0 | { |
1895 | 0 | fz_drop_pool(ctx, pool); |
1896 | 0 | fz_rethrow(ctx); |
1897 | 0 | } |
1898 | | |
1899 | 53 | return tree; |
1900 | 53 | } |
1901 | | |
1902 | | fz_html * |
1903 | | fz_parse_html(fz_context *ctx, |
1904 | | fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css, |
1905 | | int try_xml, int try_html5, int patch_mobi) |
1906 | 14 | { |
1907 | 14 | fz_html *html = fz_new_derived_html_tree(ctx, fz_html, fz_drop_html_imp); |
1908 | | |
1909 | 14 | html->layout_w = 0; |
1910 | 14 | html->layout_h = 0; |
1911 | 14 | html->layout_em = 0; |
1912 | | |
1913 | 28 | fz_try(ctx) |
1914 | 28 | fz_parse_html_tree(ctx, set, zip, base_uri, buf, user_css, try_xml, try_html5, &html->tree, &html->title, 1, patch_mobi); |
1915 | 28 | fz_catch(ctx) |
1916 | 0 | { |
1917 | 0 | fz_drop_html(ctx, html); |
1918 | 0 | fz_rethrow(ctx); |
1919 | 0 | } |
1920 | | |
1921 | 14 | return html; |
1922 | 14 | } |
1923 | | |
/*
 * State needed to temporarily redirect warnings into a buffer and to put
 * the previous warning callback back afterwards.
 */
typedef struct
{
	int saved;		/* set to 1 by redirect_warnings_to_buffer; restore_warnings does nothing while 0 */
	fz_warning_cb *old;	/* warning callback that was installed before the redirect */
	void *arg;		/* user argument belonging to 'old' */
	fz_buffer *buffer;	/* buffer that warn_to_buffer appends messages to */
	fz_context *ctx;	/* context used by warn_to_buffer for buffer operations */
} warning_save;
1932 | | |
1933 | | static void |
1934 | | warn_to_buffer(void *user, const char *message) |
1935 | 5 | { |
1936 | 5 | warning_save *save = (warning_save *)user; |
1937 | 5 | fz_context *ctx = save->ctx; |
1938 | | |
1939 | 10 | fz_try(ctx) |
1940 | 10 | { |
1941 | 5 | fz_append_string(ctx, save->buffer, message); |
1942 | 5 | fz_append_byte(ctx, save->buffer, '\n'); |
1943 | 5 | } |
1944 | 10 | fz_catch(ctx) |
1945 | 0 | { |
1946 | | /* Silently swallow the error. */ |
1947 | 0 | fz_rethrow_if(ctx, FZ_ERROR_SYSTEM); |
1948 | 0 | fz_report_error(ctx); |
1949 | 0 | } |
1950 | 5 | } |
1951 | | |
1952 | | static void |
1953 | | redirect_warnings_to_buffer(fz_context *ctx, fz_buffer *buf, warning_save *save) |
1954 | 78 | { |
1955 | 78 | save->saved = 1; |
1956 | 78 | save->old = fz_warning_callback(ctx, &save->arg); |
1957 | 78 | save->buffer = buf; |
1958 | 78 | save->ctx = ctx; |
1959 | | |
1960 | 78 | fz_flush_warnings(ctx); |
1961 | 78 | fz_set_warning_callback(ctx, warn_to_buffer, save); |
1962 | 78 | } |
1963 | | |
1964 | | static void |
1965 | | restore_warnings(fz_context *ctx, warning_save *save) |
1966 | 78 | { |
1967 | 78 | if (!save->saved) |
1968 | 0 | return; |
1969 | | |
1970 | 78 | fz_flush_warnings(ctx); |
1971 | 78 | fz_set_warning_callback(ctx, save->old, save->arg); |
1972 | 78 | } |
1973 | | |
1974 | | fz_story * |
1975 | | fz_new_story(fz_context *ctx, fz_buffer *buf, const char *user_css, float em, fz_archive *zip) |
1976 | 39 | { |
1977 | 39 | fz_story *story = fz_new_derived_html_tree(ctx, fz_story, fz_drop_story_imp); |
1978 | 39 | warning_save saved = { 0 }; |
1979 | 39 | fz_buffer *local_buffer = NULL; |
1980 | | |
1981 | 39 | if (buf == NULL) |
1982 | 0 | { |
1983 | 0 | local_buffer = fz_new_buffer(ctx, 0); |
1984 | 0 | buf = local_buffer; |
1985 | 0 | } |
1986 | | |
1987 | 39 | fz_var(local_buffer); |
1988 | 39 | fz_var(saved); |
1989 | | |
1990 | 78 | fz_try(ctx) |
1991 | 78 | { |
1992 | 39 | story->zip = fz_keep_archive(ctx, zip); |
1993 | 39 | story->font_set = fz_new_html_font_set(ctx); |
1994 | 39 | story->em = em; |
1995 | 39 | story->user_css = user_css ? fz_strdup(ctx, user_css) : NULL; |
1996 | 39 | story->warnings = fz_new_buffer(ctx, 128); |
1997 | 39 | redirect_warnings_to_buffer(ctx, story->warnings, &saved); |
1998 | 39 | story->dom = parse_to_xml(ctx, buf, 0, 1); |
1999 | 39 | } |
2000 | 78 | fz_always(ctx) |
2001 | 39 | { |
2002 | 39 | restore_warnings(ctx, &saved); |
2003 | 39 | fz_drop_buffer(ctx, local_buffer); |
2004 | 39 | } |
2005 | 39 | fz_catch(ctx) |
2006 | 0 | { |
2007 | 0 | fz_drop_html_tree(ctx, &story->tree); |
2008 | 0 | fz_rethrow(ctx); |
2009 | 0 | } |
2010 | | |
2011 | 39 | return story; |
2012 | 39 | } |
2013 | | |
2014 | | fz_html * |
2015 | | fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) |
2016 | 0 | { |
2017 | | /* try as XML first, fall back to HTML5 */ |
2018 | 0 | return fz_parse_html(ctx, set, zip, base_uri, buf, user_css, 1, 1, 0); |
2019 | 0 | } |
2020 | | |
/* Write 'level' tab characters to stdout; nothing is written if level <= 0. */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; i++)
		putchar('\t');
}
2026 | | |
2027 | | static void |
2028 | | fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level) |
2029 | 0 | { |
2030 | 0 | fz_html_box *sbox = NULL; |
2031 | 0 | while (flow) |
2032 | 0 | { |
2033 | 0 | if (flow->box != sbox) { |
2034 | 0 | sbox = flow->box; |
2035 | 0 | indent(level); |
2036 | 0 | #ifndef NDEBUG |
2037 | 0 | printf("@style <%s> em=%g font='%s'", sbox->tag, sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); |
2038 | | #else |
2039 | | printf("@style em=%g font='%s'", sbox->s.layout.em, fz_font_name(ctx, sbox->style->font)); |
2040 | | #endif |
2041 | 0 | if (fz_font_is_serif(ctx, sbox->style->font)) |
2042 | 0 | printf(" serif"); |
2043 | 0 | else |
2044 | 0 | printf(" sans"); |
2045 | 0 | if (fz_font_is_monospaced(ctx, sbox->style->font)) |
2046 | 0 | printf(" monospaced"); |
2047 | 0 | if (fz_font_is_bold(ctx, sbox->style->font)) |
2048 | 0 | printf(" bold"); |
2049 | 0 | if (fz_font_is_italic(ctx, sbox->style->font)) |
2050 | 0 | printf(" italic"); |
2051 | 0 | if (sbox->style->small_caps) |
2052 | 0 | printf(" small-caps"); |
2053 | 0 | printf("\n"); |
2054 | 0 | } |
2055 | |
|
2056 | 0 | indent(level); |
2057 | 0 | switch (flow->type) { |
2058 | 0 | case FLOW_WORD: printf("word "); break; |
2059 | 0 | case FLOW_SPACE: printf("space"); break; |
2060 | 0 | case FLOW_SBREAK: printf("sbrk "); break; |
2061 | 0 | case FLOW_SHYPHEN: printf("shy "); break; |
2062 | 0 | case FLOW_BREAK: printf("break"); break; |
2063 | 0 | case FLOW_IMAGE: printf("image"); break; |
2064 | 0 | case FLOW_ANCHOR: printf("anchor"); break; |
2065 | 0 | } |
2066 | | // printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w); |
2067 | 0 | if (flow->type == FLOW_IMAGE) |
2068 | 0 | printf(" h=%g", flow->h); |
2069 | 0 | if (flow->type == FLOW_WORD) |
2070 | 0 | printf(" text='%s'", flow->content.text); |
2071 | 0 | printf("\n"); |
2072 | 0 | if (flow->breaks_line) { |
2073 | 0 | indent(level); |
2074 | 0 | printf("*\n"); |
2075 | 0 | } |
2076 | |
|
2077 | 0 | flow = flow->next; |
2078 | 0 | } |
2079 | 0 | } |
2080 | | |
2081 | | fz_structure fz_html_tag_to_structure(const char *tag) |
2082 | 1.58k | { |
2083 | 1.58k | if (!strcmp(tag, "body")) return FZ_STRUCTURE_DOCUMENT; |
2084 | 1.52k | if (!strcmp(tag, "div")) return FZ_STRUCTURE_DIV; |
2085 | 1.51k | if (!strcmp(tag, "span")) return FZ_STRUCTURE_SPAN; |
2086 | 1.51k | if (!strcmp(tag, "blockquote")) return FZ_STRUCTURE_BLOCKQUOTE; |
2087 | 1.51k | if (!strcmp(tag, "p")) return FZ_STRUCTURE_P; |
2088 | 279 | if (!strcmp(tag, "h1")) return FZ_STRUCTURE_H1; |
2089 | 276 | if (!strcmp(tag, "h2")) return FZ_STRUCTURE_H2; |
2090 | 276 | if (!strcmp(tag, "h3")) return FZ_STRUCTURE_H3; |
2091 | 276 | if (!strcmp(tag, "h4")) return FZ_STRUCTURE_H4; |
2092 | 276 | if (!strcmp(tag, "h5")) return FZ_STRUCTURE_H5; |
2093 | 276 | if (!strcmp(tag, "h6")) return FZ_STRUCTURE_H6; |
2094 | 276 | if (!strcmp(tag, "ol")) return FZ_STRUCTURE_LIST; |
2095 | 276 | if (!strcmp(tag, "ul")) return FZ_STRUCTURE_LIST; |
2096 | 276 | if (!strcmp(tag, "dl")) return FZ_STRUCTURE_LIST; |
2097 | 276 | if (!strcmp(tag, "li")) return FZ_STRUCTURE_LISTITEM; |
2098 | 276 | if (!strcmp(tag, "table")) return FZ_STRUCTURE_TABLE; |
2099 | 276 | if (!strcmp(tag, "tr")) return FZ_STRUCTURE_TR; |
2100 | 276 | if (!strcmp(tag, "th")) return FZ_STRUCTURE_TH; |
2101 | 276 | if (!strcmp(tag, "td")) return FZ_STRUCTURE_TD; |
2102 | 276 | if (!strcmp(tag, "thead")) return FZ_STRUCTURE_THEAD; |
2103 | 276 | if (!strcmp(tag, "tbody")) return FZ_STRUCTURE_TBODY; |
2104 | 276 | if (!strcmp(tag, "tfoot")) return FZ_STRUCTURE_TFOOT; |
2105 | 276 | return FZ_STRUCTURE_INVALID; |
2106 | 276 | } |
2107 | | |
2108 | | static void |
2109 | | fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level) |
2110 | 0 | { |
2111 | 0 | while (box) |
2112 | 0 | { |
2113 | 0 | indent(level); |
2114 | 0 | printf("box "); |
2115 | 0 | switch (box->type) { |
2116 | 0 | case BOX_BLOCK: printf("block"); break; |
2117 | 0 | case BOX_FLOW: printf("flow"); break; |
2118 | 0 | case BOX_INLINE: printf("inline"); break; |
2119 | 0 | case BOX_TABLE: printf("table"); break; |
2120 | 0 | case BOX_TABLE_ROW: printf("table-row"); break; |
2121 | 0 | case BOX_TABLE_CELL: printf("table-cell"); break; |
2122 | 0 | } |
2123 | | |
2124 | 0 | printf(" <%s>", box->tag); |
2125 | | // printf(" em=%g", box->em); |
2126 | | // printf(" x=%g y=%g w=%g b=%g", box->x, box->y, box->w, box->b); |
2127 | |
|
2128 | 0 | if (box->is_first_flow) |
2129 | 0 | printf(" is-first-flow"); |
2130 | 0 | if (box->list_item) |
2131 | 0 | printf(" list=%d", box->list_item); |
2132 | 0 | if (box->id) |
2133 | 0 | printf(" id=(%s)", box->id); |
2134 | 0 | if (box->href) |
2135 | 0 | printf(" href=(%s)", box->href); |
2136 | 0 | printf("\n"); |
2137 | |
|
2138 | 0 | if (box->type == BOX_BLOCK || box->type == BOX_TABLE) { |
2139 | 0 | indent(level+1); |
2140 | 0 | printf(">margin=(%g %g %g %g)\n", box->u.block.margin[0], box->u.block.margin[1], box->u.block.margin[2], box->u.block.margin[3]); |
2141 | | //indent(level+1); |
2142 | | //printf(">padding=(%g %g %g %g)\n", box->u.block.padding[0], box->u.block.padding[1], box->u.block.padding[2], box->u.block.padding[3]); |
2143 | | //indent(level+1); |
2144 | | //printf(">border=(%g %g %g %g)\n", box->u.block.border[0], box->u.block.border[1], box->u.block.border[2], box->u.block.border[3]); |
2145 | 0 | } |
2146 | |
|
2147 | 0 | if (box->down) |
2148 | 0 | fz_debug_html_box(ctx, box->down, level + 1); |
2149 | 0 | if (box->type == BOX_FLOW) { |
2150 | 0 | indent(level+1); |
2151 | 0 | printf("flow\n"); |
2152 | 0 | fz_debug_html_flow(ctx, box->u.flow.head, level + 2); |
2153 | 0 | } |
2154 | |
|
2155 | 0 | box = box->next; |
2156 | 0 | } |
2157 | 0 | } |
2158 | | |
2159 | | void |
2160 | | fz_debug_html(fz_context *ctx, fz_html_box *box) |
2161 | 0 | { |
2162 | 0 | fz_debug_html_box(ctx, box, 0); |
2163 | 0 | } |
2164 | | |
2165 | | static size_t |
2166 | | fz_html_size(fz_context *ctx, fz_html *html) |
2167 | 0 | { |
2168 | 0 | return html ? fz_pool_size(ctx, html->tree.pool) : 0; |
2169 | 0 | } |
2170 | | |
/* Magic to make html storable. */
/* Store key identifying a parsed fz_html by owning document and chapter number. */
typedef struct {
	int refs;		/* reference count, managed through fz_keep_imp/fz_drop_imp */
	void *doc;		/* owning document; compared by pointer value */
	int chapter_num;	/* chapter number within the document */
} fz_html_key;
2177 | | |
2178 | | static int |
2179 | | fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_) |
2180 | 0 | { |
2181 | 0 | fz_html_key *key = (fz_html_key *)key_; |
2182 | 0 | hash->u.pi.ptr = key->doc; |
2183 | 0 | hash->u.pi.i = key->chapter_num; |
2184 | 0 | return 1; |
2185 | 0 | } |
2186 | | |
2187 | | static void * |
2188 | | fz_keep_html_key(fz_context *ctx, void *key_) |
2189 | 0 | { |
2190 | 0 | fz_html_key *key = (fz_html_key *)key_; |
2191 | 0 | return fz_keep_imp(ctx, key, &key->refs); |
2192 | 0 | } |
2193 | | |
2194 | | static void |
2195 | | fz_drop_html_key(fz_context *ctx, void *key_) |
2196 | 0 | { |
2197 | 0 | fz_html_key *key = (fz_html_key *)key_; |
2198 | 0 | if (fz_drop_imp(ctx, key, &key->refs)) |
2199 | 0 | { |
2200 | 0 | fz_free(ctx, key); |
2201 | 0 | } |
2202 | 0 | } |
2203 | | |
2204 | | static int |
2205 | | fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_) |
2206 | 0 | { |
2207 | 0 | fz_html_key *k0 = (fz_html_key *)k0_; |
2208 | 0 | fz_html_key *k1 = (fz_html_key *)k1_; |
2209 | 0 | return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num; |
2210 | 0 | } |
2211 | | |
2212 | | static void |
2213 | | fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_) |
2214 | 0 | { |
2215 | 0 | fz_html_key *key = (fz_html_key *)key_; |
2216 | 0 | fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num); |
2217 | 0 | } |
2218 | | |
/*
 * Store type descriptor for fz_html items keyed by fz_html_key: a name
 * followed by the hash, keep, drop, compare and format callbacks defined
 * above. The final entry is left NULL.
 */
static const fz_store_type fz_html_store_type =
{
	"fz_html",
	fz_make_hash_html_key,
	fz_keep_html_key,
	fz_drop_html_key,
	fz_cmp_html_key,
	fz_format_html_key,
	NULL
};
2229 | | |
2230 | | fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter) |
2231 | 0 | { |
2232 | 0 | fz_html_key *key = NULL; |
2233 | 0 | fz_html *other_html; |
2234 | | |
2235 | | /* Stick the parsed html in the store */ |
2236 | 0 | fz_var(key); |
2237 | |
|
2238 | 0 | fz_try(ctx) |
2239 | 0 | { |
2240 | 0 | key = fz_malloc_struct(ctx, fz_html_key); |
2241 | 0 | key->refs = 1; |
2242 | 0 | key->doc = doc; |
2243 | 0 | key->chapter_num = chapter; |
2244 | 0 | other_html = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type); |
2245 | 0 | if (other_html) |
2246 | 0 | { |
2247 | 0 | fz_drop_html(ctx, html); |
2248 | 0 | html = other_html; |
2249 | 0 | } |
2250 | 0 | } |
2251 | 0 | fz_always(ctx) |
2252 | 0 | fz_drop_html_key(ctx, key); |
2253 | 0 | fz_catch(ctx) |
2254 | 0 | { |
2255 | | /* Do nothing */ |
2256 | 0 | } |
2257 | |
|
2258 | 0 | return html; |
2259 | 0 | } |
2260 | | |
2261 | | fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter) |
2262 | 0 | { |
2263 | 0 | fz_html_key key; |
2264 | |
|
2265 | 0 | key.refs = 1; |
2266 | 0 | key.doc = doc; |
2267 | 0 | key.chapter_num = chapter; |
2268 | 0 | return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type); |
2269 | 0 | } |
2270 | | |
2271 | | static int |
2272 | | html_filter_store(fz_context *ctx, void *doc, void *key_) |
2273 | 0 | { |
2274 | 0 | fz_html_key *key = (fz_html_key *)key_; |
2275 | |
|
2276 | 0 | return (doc == key->doc); |
2277 | 0 | } |
2278 | | |
2279 | | void fz_purge_stored_html(fz_context *ctx, void *doc) |
2280 | 0 | { |
2281 | 0 | fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type); |
2282 | 0 | } |
2283 | | |
2284 | | static void |
2285 | | convert_to_boxes(fz_context *ctx, fz_story *story) |
2286 | 39 | { |
2287 | 39 | warning_save saved = { 0 }; |
2288 | | |
2289 | 39 | if (story->dom == NULL) |
2290 | 0 | return; |
2291 | | |
2292 | 39 | fz_var(saved); |
2293 | | |
2294 | 78 | fz_try(ctx) |
2295 | 78 | { |
2296 | 39 | redirect_warnings_to_buffer(ctx, story->warnings, &saved); |
2297 | 39 | xml_to_boxes(ctx, story->font_set, story->zip, ".", story->user_css, story->dom, &story->tree, NULL, 0, 0); |
2298 | 39 | } |
2299 | 78 | fz_always(ctx) |
2300 | 39 | { |
2301 | 39 | fz_drop_xml(ctx, story->dom); |
2302 | 39 | story->dom = NULL; |
2303 | 39 | restore_warnings(ctx, &saved); |
2304 | 39 | } |
2305 | 39 | fz_catch(ctx) |
2306 | 0 | fz_rethrow(ctx); |
2307 | 39 | } |
2308 | | |
2309 | | int fz_place_story(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled) |
2310 | 39 | { |
2311 | 39 | return fz_place_story_flags(ctx, story, where, filled, 0); |
2312 | 39 | } |
2313 | | |
2314 | | int fz_place_story_flags(fz_context *ctx, fz_story *story, fz_rect where, fz_rect *filled, int flags) |
2315 | 39 | { |
2316 | 39 | float w, h; |
2317 | | |
2318 | 39 | if (filled) |
2319 | 0 | *filled = fz_empty_rect; |
2320 | | |
2321 | 39 | if (story == NULL || story->complete) |
2322 | 0 | return 0; |
2323 | | |
2324 | | /* Convert from XML to box model on the first attempt to place. |
2325 | | * The DOM is unusable from here on in. */ |
2326 | 39 | convert_to_boxes(ctx, story); |
2327 | | |
2328 | 39 | w = where.x1 - where.x0; |
2329 | 39 | h = where.y1 - where.y0; |
2330 | | /* Confusingly, we call the layout using restart_draw, not restart_place, |
2331 | | * because we don't want to destroy the current values in restart_place |
2332 | | * in case we have to retry later. This means the values are left in |
2333 | | * the correct struct though! */ |
2334 | 39 | story->restart_draw.start = story->restart_place.start; |
2335 | 39 | story->restart_draw.start_flow = story->restart_place.start_flow; |
2336 | 39 | story->restart_draw.end = NULL; |
2337 | 39 | story->restart_draw.end_flow = NULL; |
2338 | 39 | story->restart_draw.reason = FZ_HTML_RESTART_REASON_NONE; |
2339 | 39 | story->restart_draw.flags = flags; |
2340 | 39 | story->bbox = where; |
2341 | 39 | fz_restartable_layout_html(ctx, &story->tree, where.x0, where.y0, w, h, story->em, &story->restart_draw); |
2342 | 39 | story->restart_draw.start = story->restart_place.start; |
2343 | 39 | story->restart_draw.start_flow = story->restart_place.start_flow; |
2344 | | |
2345 | 39 | if (filled) |
2346 | 0 | { |
2347 | 0 | fz_html_box *b = story->tree.root; |
2348 | 0 | filled->x0 = b->s.layout.x - b->u.block.margin[L] - b->u.block.border[L] - b->u.block.padding[L]; |
2349 | 0 | filled->x1 = b->s.layout.w + b->u.block.margin[R] + b->u.block.border[R] + b->u.block.padding[R] + b->s.layout.x; |
2350 | 0 | filled->y0 = b->s.layout.y - b->u.block.margin[T] - b->u.block.border[T] - b->u.block.padding[T]; |
2351 | 0 | filled->y1 = b->s.layout.b + b->u.block.margin[B] + b->u.block.border[B] + b->u.block.padding[B]; |
2352 | 0 | } |
2353 | | |
2354 | 39 | #ifndef NDEBUG |
2355 | 39 | if (fz_atoi(getenv("FZ_DEBUG_HTML"))) |
2356 | 0 | fz_debug_html(ctx, story->tree.root); |
2357 | 39 | #endif |
2358 | | |
2359 | 39 | if (story->restart_draw.end == NULL) |
2360 | 39 | return FZ_HTML_RESTART_REASON_NONE; |
2361 | 0 | if (story->restart_draw.reason == FZ_HTML_RESTART_REASON_LINE_WIDTH) |
2362 | 0 | return FZ_HTML_RESTART_REASON_LINE_WIDTH; |
2363 | 0 | return FZ_HTML_RESTART_REASON_LINE_HEIGHT; |
2364 | 0 | } |
2365 | | |
2366 | | const char * |
2367 | | fz_story_warnings(fz_context *ctx, fz_story *story) |
2368 | 0 | { |
2369 | 0 | unsigned char *data; |
2370 | |
|
2371 | 0 | if (!story) |
2372 | 0 | return NULL; |
2373 | | |
2374 | 0 | convert_to_boxes(ctx, story); |
2375 | |
|
2376 | 0 | fz_terminate_buffer(ctx, story->warnings); |
2377 | |
|
2378 | 0 | if (fz_buffer_storage(ctx, story->warnings, &data) == 0) |
2379 | 0 | return NULL; |
2380 | | |
2381 | 0 | return (const char *)data; |
2382 | 0 | } |