/src/mupdf/source/fitz/xml.c
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (C) 2004-2022 Artifex Software, Inc. |
2 | | // |
3 | | // This file is part of MuPDF. |
4 | | // |
5 | | // MuPDF is free software: you can redistribute it and/or modify it under the |
6 | | // terms of the GNU Affero General Public License as published by the Free |
7 | | // Software Foundation, either version 3 of the License, or (at your option) |
8 | | // any later version. |
9 | | // |
10 | | // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY |
11 | | // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | | // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more |
13 | | // details. |
14 | | // |
15 | | // You should have received a copy of the GNU Affero General Public License |
16 | | // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> |
17 | | // |
18 | | // Alternative licensing terms are available from the licensor. |
19 | | // For commercial licensing, see <https://www.artifex.com/> or contact |
20 | | // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, |
21 | | // CA 94129, USA, for further information. |
22 | | |
23 | | #include "xml-imp.h" |
24 | | |
25 | | #include <string.h> |
26 | | #include <stdlib.h> |
27 | | #include <stdio.h> |
28 | | |
29 | | #include <gumbo.h> |
30 | | |
31 | 33.9k | #define FZ_XML_MAX_DEPTH 4096 |
32 | | |
33 | | /* #define FZ_XML_SEQ */ |
34 | | |
/*
 * Named character entities and the Unicode code point each one stands for.
 * xml_parse_entity() scans this table linearly and accepts an entry only
 * when the name is followed immediately by ';'.
 */
35 | | static const struct { const char *name; int c; } html_entities[] = { |
36 | | {"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163}, |
37 | | {"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167}, |
38 | | {"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171}, |
39 | | {"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176}, |
40 | | {"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180}, |
41 | | {"micro",181}, {"para",182}, {"middot",183}, {"cedil",184}, |
42 | | {"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188}, |
43 | | {"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192}, |
44 | | {"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196}, |
45 | | {"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200}, |
46 | | {"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204}, |
47 | | {"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208}, |
48 | | {"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212}, |
49 | | {"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216}, |
50 | | {"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220}, |
51 | | {"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224}, |
52 | | {"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228}, |
53 | | {"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232}, |
54 | | {"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236}, |
55 | | {"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240}, |
56 | | {"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244}, |
57 | | {"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248}, |
58 | | {"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252}, |
59 | | {"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62}, |
60 | | {"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339}, |
61 | | {"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710}, |
62 | | {"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201}, |
63 | | {"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207}, |
64 | | {"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217}, |
65 | | {"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222}, |
66 | | {"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249}, |
67 | | {"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913}, |
68 | | {"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917}, |
69 | | {"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922}, |
70 | | {"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927}, |
71 | | {"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933}, |
72 | | {"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945}, |
73 | | {"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949}, |
74 | | {"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954}, |
75 | | {"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959}, |
76 | | {"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964}, |
77 | | {"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969}, |
78 | | {"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226}, |
79 | | {"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254}, |
80 | | {"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476}, |
81 | | {"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593}, |
82 | | {"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629}, |
83 | | {"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659}, |
84 | | {"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707}, |
85 | | {"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713}, |
86 | | {"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722}, |
87 | | {"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734}, |
88 | | {"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746}, |
89 | | {"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773}, |
90 | | {"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805}, |
91 | | {"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838}, |
92 | | {"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869}, |
93 | | {"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970}, |
94 | | {"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674}, |
95 | | {"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830}, |
96 | | }; |
97 | | |
/* Mutable state shared by the xml_emit_* callbacks while a document is parsed. */
98 | | struct parser |
99 | | { |
/* Pool that every node and attribute is allocated from. */
100 | | fz_pool *pool; |
/* Innermost element that is currently open; new nodes become its children. */
101 | | fz_xml *head; |
/* When zero, text runs consisting only of whitespace are dropped. */
102 | | int preserve_white; |
/* Number of currently open nodes; compared against FZ_XML_MAX_DEPTH on open. */
103 | | int depth; |
104 | | #ifdef FZ_XML_SEQ |
/* Next sequence number to stamp on a newly created node (debug aid). */
105 | | int seq; |
106 | | #endif |
107 | | }; |
108 | | |
109 | | static void xml_indent(fz_context *ctx, fz_output *out, int n) |
110 | 0 | { |
111 | 0 | while (n--) { |
112 | 0 | fz_write_byte(ctx, out, ' '); |
113 | 0 | fz_write_byte(ctx, out, ' '); |
114 | 0 | } |
115 | 0 | } |
116 | | |
117 | | void fz_debug_xml(fz_xml *item, int level) |
118 | 0 | { |
119 | | /* This is a bit nasty as it relies on implementation |
120 | | * details of both fz_stdout, and fz_write_printf coping |
121 | | * with NULL ctx. */ |
122 | 0 | fz_output_xml(NULL, fz_stdout(NULL), item, level); |
123 | 0 | } |
124 | | |
125 | | void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level) |
126 | 0 | { |
127 | 0 | char *s; |
128 | |
|
129 | 0 | if (item == NULL) |
130 | 0 | return; |
131 | | |
132 | | /* Skip over the DOC object at the top. */ |
133 | 0 | if (item->up == NULL) |
134 | 0 | { |
135 | 0 | fz_xml *child; |
136 | 0 | for (child = fz_xml_down(item); child; child = child->u.node.next) |
137 | 0 | fz_output_xml(ctx, out, child, level + 1); |
138 | 0 | return; |
139 | 0 | } |
140 | | |
141 | 0 | s = fz_xml_text(item); |
142 | 0 | xml_indent(ctx, out, level); |
143 | 0 | if (s) |
144 | 0 | { |
145 | 0 | int c; |
146 | 0 | fz_write_byte(ctx, out, '"'); |
147 | 0 | while (*s) { |
148 | 0 | s += fz_chartorune(&c, s); |
149 | 0 | switch (c) { |
150 | 0 | default: |
151 | 0 | if (c > 0xFFFF) |
152 | 0 | fz_write_printf(ctx, out, "\\u{%X}", c); |
153 | 0 | else if (c < 32 || c > 127) |
154 | 0 | fz_write_printf(ctx, out, "\\u%04X", c); |
155 | 0 | else |
156 | 0 | fz_write_byte(ctx, out, c); |
157 | 0 | break; |
158 | 0 | case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break; |
159 | 0 | case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break; |
160 | 0 | case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break; |
161 | 0 | case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break; |
162 | 0 | case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break; |
163 | 0 | case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break; |
164 | 0 | } |
165 | 0 | } |
166 | 0 | fz_write_byte(ctx, out, '"'); |
167 | | #ifdef FZ_XML_SEQ |
168 | | fz_write_printf(ctx, out, " <%d>", item->seq); |
169 | | #endif |
170 | 0 | fz_write_byte(ctx, out, '\n'); |
171 | 0 | } |
172 | 0 | else |
173 | 0 | { |
174 | 0 | fz_xml *child; |
175 | 0 | struct attribute *att; |
176 | |
|
177 | | #ifdef FZ_XML_SEQ |
178 | | fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq); |
179 | | #else |
180 | 0 | fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name); |
181 | 0 | #endif |
182 | 0 | for (att = item->u.node.u.d.atts; att; att = att->next) |
183 | 0 | { |
184 | 0 | xml_indent(ctx, out, level); |
185 | 0 | fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value); |
186 | 0 | } |
187 | 0 | for (child = fz_xml_down(item); child; child = child->u.node.next) |
188 | 0 | fz_output_xml(ctx, out, child, level + 1); |
189 | 0 | xml_indent(ctx, out, level); |
190 | | #ifdef FZ_XML_SEQ |
191 | | fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq); |
192 | | #else |
193 | 0 | fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name); |
194 | 0 | #endif |
195 | 0 | } |
196 | 0 | } |
197 | | |
198 | | fz_xml *fz_xml_prev(fz_xml *item) |
199 | 0 | { |
200 | 0 | return item && item->up ? item->u.node.prev : NULL; |
201 | 0 | } |
202 | | |
203 | | fz_xml *fz_xml_next(fz_xml *item) |
204 | 27.7k | { |
205 | 27.7k | return item && item->up ? item->u.node.next : NULL; |
206 | 27.7k | } |
207 | | |
208 | | fz_xml *fz_xml_up(fz_xml *item) |
209 | 174k | { |
210 | | /* Never step up to the DOC. */ |
211 | 174k | return item && item->up && item->up->up ? item->up : NULL; |
212 | 174k | } |
213 | | |
214 | | fz_xml *fz_xml_down(fz_xml *item) |
215 | 19.8k | { |
216 | | /* DOC items can never have MAGIC_TEXT as their down value, |
217 | | * so this is safe. */ |
218 | 19.8k | return item && !FZ_TEXT_ITEM(item) ? item->down : NULL; |
219 | 19.8k | } |
220 | | |
221 | | char *fz_xml_text(fz_xml *item) |
222 | 10.6k | { |
223 | | /* DOC items can never have MAGIC_TEXT as their down value, |
224 | | * so this is safe. */ |
225 | 10.6k | return (item && FZ_TEXT_ITEM(item)) ? item->u.node.u.text : NULL; |
226 | 10.6k | } |
227 | | |
228 | | char *fz_xml_tag(fz_xml *item) |
229 | 12.6k | { |
230 | | /* DOC items can never have MAGIC_TEXT as their down value, |
231 | | * so this is safe. */ |
232 | 12.6k | return item && !FZ_TEXT_ITEM(item) ? item->u.node.u.d.name : NULL; |
233 | 12.6k | } |
234 | | |
235 | | int fz_xml_is_tag(fz_xml *item, const char *name) |
236 | 222k | { |
237 | 222k | if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item)) |
238 | 5.82k | return 0; |
239 | 216k | return !strcmp(item->u.node.u.d.name, name); |
240 | 222k | } |
241 | | |
242 | | char *fz_xml_att(fz_xml *item, const char *name) |
243 | 174k | { |
244 | 174k | struct attribute *att; |
245 | 174k | if (!item || FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item)) |
246 | 1.13k | return NULL; |
247 | 300k | for (att = item->u.node.u.d.atts; att; att = att->next) |
248 | 130k | if (!strcmp(att->name, name)) |
249 | 3.74k | return att->value; |
250 | 170k | return NULL; |
251 | 173k | } |
252 | | |
253 | | char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two) |
254 | 5.68k | { |
255 | 5.68k | char *val = fz_xml_att(item, one); |
256 | 5.68k | if (!val) |
257 | 5.68k | val = fz_xml_att(item, two); |
258 | 5.68k | return val; |
259 | 5.68k | } |
260 | | |
261 | | fz_xml *fz_xml_find(fz_xml *item, const char *tag) |
262 | 279 | { |
263 | | /* Skip over any DOC item. */ |
264 | 279 | if (item && FZ_DOCUMENT_ITEM(item)) |
265 | 0 | item = item->down; |
266 | | |
267 | 302 | while (item) |
268 | 238 | { |
269 | 238 | if (!FZ_TEXT_ITEM(item) && !strcmp(item->u.node.u.d.name, tag)) |
270 | 215 | return item; |
271 | 23 | item = item->u.node.next; |
272 | 23 | } |
273 | 64 | return NULL; |
274 | 279 | } |
275 | | |
276 | | fz_xml *fz_xml_find_next(fz_xml *item, const char *tag) |
277 | 0 | { |
278 | | /* Skip over any DOC item. */ |
279 | 0 | if (item && FZ_DOCUMENT_ITEM(item)) |
280 | 0 | item = item->down; |
281 | |
|
282 | 0 | if (item) |
283 | 0 | item = item->u.node.next; |
284 | 0 | return fz_xml_find(item, tag); |
285 | 0 | } |
286 | | |
287 | | fz_xml *fz_xml_find_down(fz_xml *item, const char *tag) |
288 | 159 | { |
289 | 159 | if (item) |
290 | 159 | item = fz_xml_down(item); |
291 | 159 | return fz_xml_find(item, tag); |
292 | 159 | } |
293 | | |
294 | | int fz_xml_att_eq(fz_xml *item, const char *name, const char *match) |
295 | 424 | { |
296 | 424 | const char *val = fz_xml_att(item, name); |
297 | | |
298 | 424 | return val ? !strcmp(val, match) : 0; |
299 | 424 | } |
300 | | |
301 | | fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match) |
302 | 0 | { |
303 | | /* Skip over any document item. */ |
304 | 0 | if (item && FZ_DOCUMENT_ITEM(item)) |
305 | 0 | item = item->down; |
306 | |
|
307 | 0 | while (1) |
308 | 0 | { |
309 | 0 | item = tag ? fz_xml_find(item, tag) : item; |
310 | 0 | if (item == NULL || fz_xml_att_eq(item, att, match)) |
311 | 0 | break; |
312 | 0 | item = item->u.node.next; |
313 | 0 | } |
314 | |
|
315 | 0 | return item; |
316 | 0 | } |
317 | | |
318 | | fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match) |
319 | 0 | { |
320 | | /* Skip over any document item. */ |
321 | 0 | if (item && FZ_DOCUMENT_ITEM(item)) |
322 | 0 | item = item->down; |
323 | |
|
324 | 0 | if (item != NULL) |
325 | 0 | { |
326 | 0 | do |
327 | 0 | { |
328 | 0 | item = tag ? fz_xml_find_next(item, tag) : item->u.node.next; |
329 | 0 | } |
330 | 0 | while (item != NULL && !fz_xml_att_eq(item, att, match)); |
331 | 0 | } |
332 | |
|
333 | 0 | return item; |
334 | 0 | } |
335 | | |
336 | | fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match) |
337 | 0 | { |
338 | 0 | return fz_xml_find_match(fz_xml_down(item), tag, att, match); |
339 | 0 | } |
340 | | |
341 | | fz_xml *fz_xml_root(fz_xml *xml) |
342 | 413 | { |
343 | 413 | if (xml == NULL) |
344 | 0 | return NULL; |
345 | | |
346 | | /* If we've been given a node mid-tree, run up to the root to find |
347 | | * the doc node. */ |
348 | 413 | while (xml->up) |
349 | 0 | xml = xml->up; |
350 | | |
351 | | /* And the root is the child of the doc.*/ |
352 | 413 | return xml->down; |
353 | 413 | } |
354 | | |
355 | | void fz_drop_xml(fz_context *ctx, fz_xml *xml) |
356 | 43.9k | { |
357 | 43.9k | if (!xml) |
358 | 43.3k | return; |
359 | | |
360 | | /* Whereever we are in the tree, we want the doc node at the root. */ |
361 | 610 | while (xml->up) |
362 | 0 | xml = xml->up; |
363 | | |
364 | | /* Drop a reference to the tree as a whole. */ |
365 | 610 | if (fz_drop_imp(ctx, xml, &xml->u.doc.refs) == 0) |
366 | 0 | return; |
367 | | |
368 | 610 | fz_drop_pool(ctx, xml->u.doc.pool); |
369 | 610 | } |
370 | | |
371 | | void fz_detach_xml(fz_context *ctx, fz_xml *node) |
372 | 0 | { |
373 | 0 | fz_xml *doc = node; |
374 | | |
375 | | /* If we're already a document node, then this is a NOP. */ |
376 | 0 | if (doc->up == NULL) |
377 | 0 | return; |
378 | | |
379 | | /* Move doc to be the doc pointer at the top of the tree. */ |
380 | 0 | while (doc->up) |
381 | 0 | { |
382 | 0 | doc = doc->up; |
383 | 0 | } |
384 | | |
385 | | /* Relocate node to be the child of doc. */ |
386 | 0 | node->up->down = NULL; |
387 | 0 | doc->down = node; |
388 | | |
389 | | /* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but |
390 | | * X->up = doc. We need to be careful throughout this code to not assume that |
391 | | * Y is always a child of Y->up. */ |
392 | 0 | } |
393 | | |
394 | | size_t xml_parse_entity(int *c, const char *a) |
395 | 1.72k | { |
396 | 1.72k | char *b; |
397 | 1.72k | size_t i; |
398 | | |
399 | 1.72k | if (a[1] == '#') { |
400 | 6 | if (a[2] == 'x') |
401 | 1 | *c = strtol(a + 3, &b, 16); |
402 | 5 | else |
403 | 5 | *c = strtol(a + 2, &b, 10); |
404 | 6 | if (*b == ';') |
405 | 1 | return b - a + 1; |
406 | 6 | } |
407 | 1.71k | else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') { |
408 | 7 | *c = '<'; |
409 | 7 | return 4; |
410 | 7 | } |
411 | 1.71k | else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') { |
412 | 3 | *c = '>'; |
413 | 3 | return 4; |
414 | 3 | } |
415 | 1.70k | else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') { |
416 | 1 | *c = '&'; |
417 | 1 | return 5; |
418 | 1 | } |
419 | 1.70k | else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') { |
420 | 0 | *c = '\''; |
421 | 0 | return 6; |
422 | 0 | } |
423 | 1.70k | else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') { |
424 | 0 | *c = '"'; |
425 | 0 | return 6; |
426 | 0 | } |
427 | | |
428 | | /* We should only be doing this for XHTML, but it shouldn't be a problem. */ |
429 | 434k | for (i = 0; i < nelem(html_entities); ++i) { |
430 | 432k | size_t n = strlen(html_entities[i].name); |
431 | 432k | if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') { |
432 | 0 | *c = html_entities[i].c; |
433 | 0 | return n + 2; |
434 | 0 | } |
435 | 432k | } |
436 | | |
437 | 1.71k | *c = *a; |
438 | 1.71k | return 1; |
439 | 1.71k | } |
440 | | |
/*
 * Return 1 when c may appear in a tag or attribute name as this parser
 * understands them (ASCII letters, digits, '.', '-', '_' and ':'), else 0.
 */
static inline int isname(int c)
{
    if (c >= 'a' && c <= 'z')
        return 1;
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= '0' && c <= '9')
        return 1;

    switch (c)
    {
    case '.':
    case '-':
    case '_':
    case ':':
        return 1;
    default:
        return 0;
    }
}
448 | | |
/* Return 1 for space, carriage return, line feed or tab; 0 otherwise. */
static inline int iswhite(int c)
{
    switch (c)
    {
    case ' ':
    case '\r':
    case '\n':
    case '\t':
        return 1;
    default:
        return 0;
    }
}
453 | | |
454 | | static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text) |
455 | 33.9k | { |
456 | 33.9k | fz_xml *head, *tail; |
457 | 33.9k | const char *ns; |
458 | 33.9k | size_t size; |
459 | | |
460 | 33.9k | if (is_text) |
461 | 8.71k | size = offsetof(fz_xml, u.node.u.text) + b-a+1; |
462 | 25.2k | else |
463 | 25.2k | { |
464 | | /* skip namespace prefix */ |
465 | 143k | for (ns = a; ns < b - 1; ++ns) |
466 | 118k | if (*ns == ':') |
467 | 18.6k | a = ns + 1; |
468 | | |
469 | 25.2k | size = offsetof(fz_xml, u.node.u.d.name) + b-a+1; |
470 | 25.2k | } |
471 | 33.9k | head = fz_pool_alloc(ctx, parser->pool, size); |
472 | | |
473 | 33.9k | if (is_text) |
474 | 8.71k | head->down = MAGIC_TEXT; |
475 | 25.2k | else |
476 | 25.2k | { |
477 | 25.2k | memcpy(head->u.node.u.d.name, a, b - a); |
478 | 25.2k | head->u.node.u.d.name[b - a] = 0; |
479 | 25.2k | head->u.node.u.d.atts = NULL; |
480 | 25.2k | head->down = NULL; |
481 | 25.2k | } |
482 | | |
483 | 33.9k | head->up = parser->head; |
484 | 33.9k | head->u.node.next = NULL; |
485 | | #ifdef FZ_XML_SEQ |
486 | | head->u.node.seq = parser->seq++; |
487 | | #endif |
488 | | |
489 | | /* During construction, we use head->next to mean "the |
490 | | * tail of the children. When we close the tag, we |
491 | | * rewrite it to be NULL. */ |
492 | 33.9k | if (!parser->head->down) { |
493 | 13.0k | parser->head->down = head; |
494 | 13.0k | parser->head->u.node.next = head; |
495 | 13.0k | head->u.node.prev = NULL; |
496 | 13.0k | } |
497 | 20.9k | else { |
498 | 20.9k | tail = parser->head->u.node.next; |
499 | 20.9k | tail->u.node.next = head; |
500 | 20.9k | head->u.node.prev = tail; |
501 | 20.9k | parser->head->u.node.next = head; |
502 | 20.9k | } |
503 | | |
504 | 33.9k | parser->head = head; |
505 | 33.9k | parser->depth++; |
506 | 33.9k | if (parser->depth >= FZ_XML_MAX_DEPTH) |
507 | 0 | fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting"); |
508 | 33.9k | } |
509 | | |
510 | | static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b) |
511 | 24.9k | { |
512 | 24.9k | fz_xml *head = parser->head; |
513 | 24.9k | struct attribute *att; |
514 | 24.9k | size_t size; |
515 | | |
516 | 24.9k | size = offsetof(struct attribute, name) + b-a+1; |
517 | 24.9k | att = fz_pool_alloc(ctx, parser->pool, size); |
518 | 24.9k | memcpy(att->name, a, b - a); |
519 | 24.9k | att->name[b - a] = 0; |
520 | 24.9k | att->value = NULL; |
521 | 24.9k | att->next = head->u.node.u.d.atts; |
522 | 24.9k | head->u.node.u.d.atts = att; |
523 | 24.9k | } |
524 | | |
525 | | void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val) |
526 | 0 | { |
527 | 0 | size_t size = offsetof(struct attribute, name) + strlen(key) + 1; |
528 | 0 | struct attribute *att = fz_pool_alloc(ctx, pool, size); |
529 | 0 | memcpy(att->name, key, strlen(key)+1); |
530 | 0 | att->value = fz_pool_alloc(ctx, pool, strlen(val) + 1); |
531 | 0 | memcpy(att->value, val, strlen(val)+1); |
532 | 0 | att->next = node->u.node.u.d.atts; |
533 | 0 | node->u.node.u.d.atts = att; |
534 | 0 | } |
535 | | |
536 | | static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b) |
537 | 24.9k | { |
538 | 24.9k | fz_xml *head = parser->head; |
539 | 24.9k | struct attribute *att = head->u.node.u.d.atts; |
540 | 24.9k | char *s; |
541 | 24.9k | int c; |
542 | | |
543 | | /* entities are all longer than UTFmax so runetochar is safe */ |
544 | 24.9k | s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
545 | 237k | while (a < b) { |
546 | 212k | if (*a == '&') { |
547 | 131 | a += xml_parse_entity(&c, a); |
548 | 131 | s += fz_runetochar(s, c); |
549 | 131 | } |
550 | 212k | else { |
551 | 212k | *s++ = *a++; |
552 | 212k | } |
553 | 212k | } |
554 | 24.9k | *s = 0; |
555 | 24.9k | } |
556 | | |
557 | | static void xml_emit_close_tag(fz_context *ctx, struct parser *parser) |
558 | 31.6k | { |
559 | 31.6k | parser->depth--; |
560 | 31.6k | parser->head->u.node.next = NULL; |
561 | 31.6k | if (parser->head->up) |
562 | 31.6k | parser->head = parser->head->up; |
563 | 31.6k | } |
564 | | |
565 | | static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b) |
566 | 9.11k | { |
567 | 9.11k | fz_xml *head; |
568 | 9.11k | const char *p; |
569 | 9.11k | char *s; |
570 | 9.11k | int c; |
571 | | |
572 | | /* Skip text outside the root tag */ |
573 | 9.11k | if (parser->depth == 0) |
574 | 249 | return; |
575 | | |
576 | | /* Skip all-whitespace text nodes */ |
577 | 8.86k | if (!parser->preserve_white) |
578 | 1.05k | { |
579 | 1.76k | for (p = a; p < b; p++) |
580 | 1.60k | if (!iswhite(*p)) |
581 | 900 | break; |
582 | 1.05k | if (p == b) |
583 | 155 | return; |
584 | 1.05k | } |
585 | | |
586 | 8.71k | xml_emit_open_tag(ctx, parser, a, b, 1); |
587 | 8.71k | head = parser->head; |
588 | | |
589 | | /* entities are all longer than UTFmax so runetochar is safe */ |
590 | 8.71k | s = fz_xml_text(head); |
591 | 279k | while (a < b) { |
592 | 271k | if (*a == '&') { |
593 | 1.59k | a += xml_parse_entity(&c, a); |
594 | 1.59k | s += fz_runetochar(s, c); |
595 | 1.59k | } |
596 | 269k | else { |
597 | 269k | *s++ = *a++; |
598 | 269k | } |
599 | 271k | } |
600 | 8.71k | *s = 0; |
601 | | |
602 | 8.71k | xml_emit_close_tag(ctx, parser); |
603 | 8.71k | } |
604 | | |
605 | | static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b) |
606 | 0 | { |
607 | 0 | fz_xml *head; |
608 | 0 | char *s; |
609 | |
|
610 | 0 | xml_emit_open_tag(ctx, parser, a, b, 1); |
611 | 0 | head = parser->head; |
612 | |
|
613 | 0 | s = head->u.node.u.text; |
614 | 0 | while (a < b) |
615 | 0 | *s++ = *a++; |
616 | 0 | *s = 0; |
617 | |
|
618 | 0 | xml_emit_close_tag(ctx, parser); |
619 | 0 | } |
620 | | |
621 | | static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p) |
622 | 9.82k | { |
623 | 9.82k | const char *ns, *tag; |
624 | | |
625 | | /* skip namespace prefix */ |
626 | 54.1k | for (ns = mark; ns < p - 1; ++ns) |
627 | 44.3k | if (*ns == ':') |
628 | 9.09k | mark = ns + 1; |
629 | | |
630 | 9.82k | tag = fz_xml_tag(parser->head); |
631 | 9.82k | if (tag && strncmp(tag, mark, p-mark) == 0 && tag[p-mark] == 0) |
632 | 9.82k | { |
633 | 9.82k | xml_emit_close_tag(ctx, parser); |
634 | 9.82k | return 0; |
635 | 9.82k | } |
636 | 7 | return 1; |
637 | 9.82k | } |
638 | | |
/*
 * Parse the NUL-terminated document at p, building the tree through the
 * xml_emit_* helpers.
 *
 * The function is a goto-driven state machine: each label below is one
 * lexical state and p is the read cursor shared by all of them.
 *
 * Returns NULL when the end of the input is reached in the text state, or a
 * static error message describing the first problem encountered.
 */
639 | | static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */ |
640 | 634 | { |
641 | 634 | const char *mark; |
642 | 634 | int quote; |
643 | | |
/* Character data: runs up to the next '<' or the end of the input. */
644 | 35.5k | parse_text: |
645 | 35.5k | mark = p; |
646 | 444k | while (*p && *p != '<') ++p; |
647 | 35.5k | if (*p == '<') { |
648 | 34.9k | if (mark < p) |
649 | 8.12k | xml_emit_text(ctx, parser, mark, p); |
650 | 34.9k | ++p; |
651 | 34.9k | goto parse_element; |
652 | 34.9k | } else if (mark < p) |
653 | 357 | xml_emit_text(ctx, parser, mark, p); |
/* End of input reached: this is the only successful exit. */
654 | 557 | return NULL; |
655 | | |
/* Just after '<': dispatch on the next character. */
656 | 34.9k | parse_element: |
657 | 34.9k | if (*p == '/') { ++p; goto parse_closing_element; } |
658 | 25.1k | if (*p == '!') { ++p; goto parse_comment; } |
659 | 25.1k | if (*p == '?') { ++p; goto parse_processing_instruction; } |
660 | 24.6k | while (iswhite(*p)) ++p; |
661 | 24.6k | if (isname(*p)) |
662 | 24.6k | goto parse_element_name; |
663 | 6 | return "syntax error in element"; |
664 | | |
/* Just after "<!": a DOCTYPE or ENTITY declaration, a CDATA section, or a
 * "<!-- ... -->" comment. Comment contents are skipped, not stored. */
665 | 27 | parse_comment: |
666 | 27 | if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E') |
667 | 19 | goto parse_declaration; |
668 | 8 | if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y') |
669 | 0 | goto parse_declaration; |
670 | 8 | if (*p == '[') goto parse_cdata; |
671 | 8 | if (*p++ != '-') return "syntax error in comment (<! not followed by --)"; |
672 | 5 | if (*p++ != '-') return "syntax error in comment (<!- not followed by -)"; |
673 | 20 | while (*p) { |
674 | 20 | if (p[0] == '-' && p[1] == '-' && p[2] == '>') { |
675 | 5 | p += 3; |
676 | 5 | goto parse_text; |
677 | 5 | } |
678 | 15 | ++p; |
679 | 15 | } |
680 | 0 | return "end of data in comment"; |
681 | | |
/* Declarations are skipped up to and including the first '>'; any '>'
 * nested inside the declaration is not treated specially. */
682 | 19 | parse_declaration: |
683 | 1.02k | while (*p) if (*p++ == '>') goto parse_text; |
684 | 1 | return "end of data in declaration"; |
685 | | |
/* p points at the '[' of "<![": require "CDATA[" next, then take everything
 * up to "]]>" verbatim. */
686 | 0 | parse_cdata: |
687 | 0 | if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') |
688 | 0 | return "syntax error in CDATA section"; |
689 | 0 | p += 7; |
690 | 0 | mark = p; |
691 | 0 | while (*p) { |
692 | 0 | if (p[0] == ']' && p[1] == ']' && p[2] == '>') { |
693 | 0 | xml_emit_cdata(ctx, parser, mark, p); |
694 | 0 | p += 3; |
695 | 0 | goto parse_text; |
696 | 0 | } |
697 | 0 | ++p; |
698 | 0 | } |
699 | 0 | return "end of data in CDATA section"; |
700 | | |
/* Just after "<?": skip everything up to and including "?>". */
701 | 457 | parse_processing_instruction: |
702 | 12.3k | while (*p) { |
703 | 12.3k | if (p[0] == '?' && p[1] == '>') { |
704 | 455 | p += 2; |
705 | 455 | goto parse_text; |
706 | 455 | } |
707 | 11.8k | ++p; |
708 | 11.8k | } |
709 | 2 | return "end of data in processing instruction"; |
710 | | |
/* Just after "</": read the name, check it against the currently open
 * element via close_tag(), then require '>'. */
711 | 9.82k | parse_closing_element: |
712 | 9.82k | while (iswhite(*p)) ++p; |
713 | 9.82k | mark = p; |
714 | 63.9k | while (isname(*p)) ++p; |
715 | 9.82k | if (!isname(*mark)) |
716 | 0 | return "syntax error in closing element"; |
717 | 9.82k | if (close_tag(ctx, parser, mark, p)) |
718 | 7 | return "opening and closing tag mismatch"; |
719 | 9.82k | while (iswhite(*p)) ++p; |
720 | 9.82k | if (*p != '>') |
721 | 1 | return "syntax error in closing element"; |
722 | 9.81k | ++p; |
723 | 9.81k | goto parse_text; |
724 | | |
/* Opening tag name. The element is opened straight away; "/>" closes it
 * again immediately. */
725 | 24.6k | parse_element_name: |
726 | 24.6k | mark = p; |
727 | 166k | while (isname(*p)) ++p; |
728 | 24.6k | xml_emit_open_tag(ctx, parser, mark, p, 0); |
729 | 24.6k | if (*p == '>') { |
730 | 9.07k | ++p; |
731 | 9.07k | goto parse_text; |
732 | 9.07k | } |
733 | 15.5k | if (p[0] == '/' && p[1] == '>') { |
734 | 1.50k | xml_emit_close_tag(ctx, parser); |
735 | 1.50k | p += 2; |
736 | 1.50k | goto parse_text; |
737 | 1.50k | } |
738 | 14.0k | if (iswhite(*p)) |
739 | 14.0k | goto parse_attributes; |
740 | 18 | return "syntax error after element name"; |
741 | | |
/* Inside an opening tag, between attributes: expect another attribute name,
 * '>' or "/>". */
742 | 38.6k | parse_attributes: |
743 | 66.3k | while (iswhite(*p)) ++p; |
744 | 38.6k | if (isname(*p)) |
745 | 24.6k | goto parse_attribute_name; |
746 | 14.0k | if (*p == '>') { |
747 | 2.96k | ++p; |
748 | 2.96k | goto parse_text; |
749 | 2.96k | } |
750 | 11.0k | if (p[0] == '/' && p[1] == '>') { |
751 | 11.0k | xml_emit_close_tag(ctx, parser); |
752 | 11.0k | p += 2; |
753 | 11.0k | goto parse_text; |
754 | 11.0k | } |
755 | 9 | return "syntax error in attributes"; |
756 | | |
/* Attribute name; an '=' must follow (whitespace around it is allowed). */
757 | 24.6k | parse_attribute_name: |
758 | 24.6k | mark = p; |
759 | 169k | while (isname(*p)) ++p; |
760 | 24.6k | xml_emit_att_name(ctx, parser, mark, p); |
761 | 24.9k | while (iswhite(*p)) ++p; |
762 | 24.6k | if (*p == '=') { ++p; goto parse_attribute_value; } |
763 | 22 | return "syntax error after attribute name"; |
764 | | |
/* Attribute value. 'quote' receives the first character of the value, which
 * is normally the opening quote; mark is set just past it. */
765 | 24.6k | parse_attribute_value: |
766 | 24.6k | while (iswhite(*p)) ++p; |
767 | 24.6k | quote = *p++; |
768 | 24.6k | mark = p; |
769 | | |
/* For an unquoted number, mark is already past the first digit, so the
 * stored value begins at the second digit. */
770 | | /* special case for handling MOBI filepos=00000 syntax */ |
771 | 24.6k | if (quote >= '0' && quote <= '9') { |
772 | 97 | while (*p >= '0' && *p <= '9') ++p; |
773 | 7 | xml_emit_att_value(ctx, parser, mark, p); |
774 | 7 | goto parse_attributes; |
775 | 7 | } |
776 | | |
/* Either quote style is accepted; the value ends at the matching quote. */
777 | 24.6k | if (quote != '"' && quote != '\'') |
778 | 0 | return "missing quote character"; |
779 | 229k | while (*p && *p != quote) ++p; |
780 | 24.6k | if (*p == quote) { |
781 | 24.6k | xml_emit_att_value(ctx, parser, mark, p++); |
782 | 24.6k | goto parse_attributes; |
783 | 24.6k | } |
784 | 8 | return "end of data in attribute value"; |
785 | 24.6k | } |
786 | | |
/*
 * Map ASCII 'A'..'Z' to 'a'..'z'; every other value is returned unchanged.
 */
static int fast_tolower(int c)
{
    /* The unsigned subtraction wraps for anything below 'A', so a single
     * comparison covers both ends of the range. */
    unsigned int offset = (unsigned)c - 'A';

    return offset < 26 ? (c | 32) : c;
}
793 | | |
/*
 * Compare at most n characters of a and b, ignoring ASCII case.
 * Returns 0 when they are equal over that span (or n is 0); otherwise the
 * difference between the first pair of lowercased characters that differ.
 */
static int fast_strncasecmp(const char *a, const char *b, size_t n)
{
    size_t remaining;

    if (n == 0)
        return 0;

    /* Advance over at most n - 1 matching characters; the final return
     * compares whichever pair we stopped on. */
    for (remaining = n - 1; remaining != 0; --remaining, ++a, ++b)
    {
        if (*a == 0 || *b == 0)
            break;
        if (fast_tolower(*a) != fast_tolower(*b))
            break;
    }
    return fast_tolower(*a) - fast_tolower(*b);
}
802 | | |
/*
 * Find the first occurrence of needle n in haystack h, ignoring ASCII case.
 * Returns a pointer to the start of the match inside h, or NULL when there
 * is none. An empty needle matches at the start of h, as with strstr().
 */
static char *fast_strcasestr(char *h, char *n)
{
    int n0;
    size_t nn;

    /* Without this guard the code below would step past the terminator of
     * an empty needle and measure whatever follows it in memory. */
    if (*n == 0)
        return h;

    /* Split the needle into its lowercased first character and the rest. */
    n0 = fast_tolower(*n++);
    nn = strlen(n);
    while (*h != 0)
    {
        if (fast_tolower(*h) == n0 && fast_strncasecmp(h+1, n, nn) == 0)
            return h;
        ++h;
    }
    return NULL;
}
815 | | |
/* Return non-zero when a begins with b, ignoring ASCII case. */
static int startswith(const char *a, const char *b)
{
    size_t prefix_len = strlen(b);

    return fast_strncasecmp(a, b, prefix_len) == 0;
}
820 | | |
/*
 * Maps encoding labels ("alias") to the canonical encoding name used by the
 * rest of the code ("encoding").
 * See https://encoding.spec.whatwg.org/#names-and-labels
 *
 * match_encoding_name() walks the table in order and does a
 * case-insensitive prefix match on the alias, so the first row whose alias
 * is a prefix of the input wins. An alias must therefore not be repeated
 * under a second encoding: the later row could never be reached.
 */
static struct { char *encoding; char *alias; } encoding_aliases[] = {
    { "big5", "big5" },
    { "big5", "big5-hkscs" },
    { "big5", "cn-big5" },
    { "big5", "csbig5" },
    { "big5", "x-x-big5" },
    { "euc-cn", "euc-cn" },
    { "euc-jp", "cseucpkdfmtjapanese" },
    { "euc-jp", "euc-jp" },
    { "euc-jp", "x-euc-jp" },
    { "euc-kr", "cseuckr" },
    { "euc-kr", "csksc56011987" },
    { "euc-kr", "euc-kr" },
    { "euc-kr", "iso-ir-149" },
    { "euc-kr", "korean" },
    { "euc-kr", "ks_c_5601" },
    { "euc-kr", "ksc5601" },
    { "euc-kr", "ksc_5601" },
    { "euc-kr", "windows-949" },
    { "euc-tw", "euc-tw" },
    { "gb18030", "chinese" },
    { "gb18030", "csgb2312" },
    { "gb18030", "csiso58gb231280" },
    { "gb18030", "gb18030" },
    { "gb18030", "gb2312" },
    { "gb18030", "gb_2312" },
    { "gb18030", "gbk" },
    { "gb18030", "iso-ir-58" },
    { "gb18030", "x-gbk" },
    { "iso-8859-1", "ascii" },
    { "iso-8859-1", "iso-8859-1" },
    { "iso-8859-1", "iso8859-1" },
    { "iso-8859-1", "latin1" },
    { "iso-8859-1", "us-ascii" },
    { "iso-8859-7", "greek" },
    { "iso-8859-7", "greek8" },
    { "iso-8859-7", "iso-8859-7" },
    { "iso-8859-7", "iso8859-7" },
    { "koi8-r", "koi" },
    { "koi8-r", "koi8" },
    { "koi8-r", "koi8-r" },
    { "koi8-r", "koi8-ru" },
    { "koi8-r", "koi8-u" },
    { "koi8-r", "koi8_r" },
    { "shift_jis", "csshiftjis" },
    { "shift_jis", "ms932" },
    { "shift_jis", "ms_kanji" },
    { "shift_jis", "shift-jis" },
    { "shift_jis", "shift_jis" },
    { "shift_jis", "sjis" },
    { "shift_jis", "windows-31j" },
    { "shift_jis", "x-sjis" },
    { "windows-1250", "cp1250" },
    { "windows-1250", "windows-1250" },
    { "windows-1251", "cp1251" },
    { "windows-1251", "windows-1251" },
    { "windows-1252", "cp1252" },
    { "windows-1252", "cp819" },
    { "windows-1252", "windows-1252" },
};
882 | | |
883 | | static char *match_encoding_name(char *enc) |
884 | 213 | { |
885 | 213 | size_t i; |
886 | 12.5k | for (i = 0; i < nelem(encoding_aliases); ++i) |
887 | 12.3k | if (startswith(enc, encoding_aliases[i].alias)) |
888 | 0 | return encoding_aliases[i].encoding; |
889 | 213 | return NULL; |
890 | 213 | } |
891 | | |
/*
	Look for an encoding in
	<meta http-equiv="content-type" content="text/html; charset=XXX"> tags.

	Each "<meta" tag is temporarily NUL-terminated at its closing '>' so the
	searches stay inside the tag; the '>' is restored before moving on.
	Returns the canonical encoding name of the first tag that yields a
	known charset, or NULL if none does.
*/
static const char *find_meta_encoding(char *s)
{
	const char *found = NULL;
	char *tag, *close, *value;

	for (tag = fast_strcasestr(s, "<meta"); tag != NULL && found == NULL; tag = fast_strcasestr(tag + 5, "<meta"))
	{
		close = strchr(tag, '>');
		if (close == NULL)
			continue;

		*close = 0;
		if (fast_strcasestr(tag, "http-equiv") && fast_strcasestr(tag, "content-type"))
		{
			value = fast_strcasestr(tag, "charset=");
			if (value != NULL)
				found = match_encoding_name(value + 8); /* 8 == strlen("charset=") */
		}
		*close = '>';
	}

	return found;
}
922 | | |
/*
	Determine the declared text encoding of the document in s.

	First looks for encoding="..." in an <?xml ...> declaration that
	appears before the first '>' in s, then falls back to
	find_meta_encoding. The first '>' is temporarily replaced by a NUL so
	the searches cannot run past it; it is restored before returning.

	Returns the canonical encoding name, or NULL if no known encoding was
	found.
*/
static const char *find_xml_encoding(char *s)
{
	const char *table = NULL;
	char *end, *xml, *enc;

	end = strchr(s, '>');
	if (end)
	{
		*end = 0;
		xml = strstr(s, "<?xml");
		if (xml)
		{
			enc = strstr(xml, "encoding=");
			if (enc)
			{
				enc += 9; /* strlen("encoding=") */
				/* Only step over an actual opening quote. Skipping a
				 * byte unconditionally could move past the temporary
				 * terminator and match text after the declaration. */
				if (*enc == '"' || *enc == '\'')
					++enc;
				table = match_encoding_name(enc);
			}
		}
		*end = '>';
	}

	if (!table)
		table = find_meta_encoding(s);

	return table;
}
951 | | |
952 | | static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree) |
953 | 687 | { |
954 | 687 | fz_text_decoder dec; |
955 | 687 | const char *enc; |
956 | 687 | const unsigned char *e = s + n; |
957 | 687 | char *dst, *d; |
958 | 687 | int m; |
959 | 687 | int c; |
960 | | |
961 | 687 | if (s[0] == 0xFE && s[1] == 0xFF) { |
962 | 0 | s += 2; |
963 | 0 | dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be"); |
964 | 0 | while (s + 1 < e) { |
965 | 0 | c = s[0] << 8 | s[1]; |
966 | 0 | d += fz_runetochar(d, c); |
967 | 0 | s += 2; |
968 | 0 | } |
969 | 0 | *d = 0; |
970 | 0 | *dofree = 1; |
971 | 0 | return dst; |
972 | 0 | } |
973 | | |
974 | 687 | if (s[0] == 0xFF && s[1] == 0xFE) { |
975 | 0 | s += 2; |
976 | 0 | dst = d = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le"); |
977 | 0 | while (s + 1 < e) { |
978 | 0 | c = s[0] | s[1] << 8; |
979 | 0 | d += fz_runetochar(d, c); |
980 | 0 | s += 2; |
981 | 0 | } |
982 | 0 | *d = 0; |
983 | 0 | *dofree = 1; |
984 | 0 | return dst; |
985 | 0 | } |
986 | | |
987 | 687 | enc = find_xml_encoding((char*)s); |
988 | 687 | if (enc) |
989 | 0 | { |
990 | 0 | fz_init_text_decoder(ctx, &dec, enc); |
991 | | // NOTE: use decode_size if memory is more important than speed |
992 | 0 | m = dec.decode_bound(&dec, s, n); |
993 | 0 | dst = Memento_label(fz_malloc(ctx, m), "utf8"); |
994 | 0 | dec.decode(&dec, dst, s, n); |
995 | 0 | *dofree = 1; |
996 | 0 | return dst; |
997 | 0 | } |
998 | | |
999 | 687 | *dofree = 0; |
1000 | | |
1001 | 687 | if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF) |
1002 | 24 | return (char*)s+3; |
1003 | | |
1004 | 663 | return (char*)s; |
1005 | 687 | } |
1006 | | |
1007 | | fz_xml * |
1008 | | fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white) |
1009 | 0 | { |
1010 | 0 | fz_buffer *buf = fz_read_all(ctx, stm, 128); |
1011 | 0 | fz_xml *xml = NULL; |
1012 | |
|
1013 | 0 | fz_var(xml); |
1014 | |
|
1015 | 0 | fz_try(ctx) |
1016 | 0 | xml = fz_parse_xml(ctx, buf, preserve_white); |
1017 | 0 | fz_always(ctx) |
1018 | 0 | fz_drop_buffer(ctx, buf); |
1019 | 0 | fz_catch(ctx) |
1020 | 0 | fz_rethrow(ctx); |
1021 | | |
1022 | 0 | return xml; |
1023 | 0 | } |
1024 | | |
1025 | | static fz_xml * |
1026 | | parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white) |
1027 | 209 | { |
1028 | 209 | fz_xml *xml = NULL; |
1029 | | |
1030 | 209 | fz_var(xml); |
1031 | | |
1032 | 418 | fz_try(ctx) |
1033 | 418 | xml = fz_parse_xml(ctx, buf, preserve_white); |
1034 | 418 | fz_always(ctx) |
1035 | 209 | fz_drop_buffer(ctx, buf); |
1036 | 209 | fz_catch(ctx) |
1037 | 12 | fz_rethrow(ctx); |
1038 | | |
1039 | 197 | return xml; |
1040 | 209 | } |
1041 | | |
1042 | | fz_xml * |
1043 | | fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white) |
1044 | 136 | { |
1045 | 136 | fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename); |
1046 | | |
1047 | 136 | return parse_and_drop_buffer(ctx, buf, preserve_white); |
1048 | 136 | } |
1049 | | |
1050 | | fz_xml * |
1051 | | fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white) |
1052 | 283 | { |
1053 | 283 | fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename); |
1054 | | |
1055 | 283 | if (buf == NULL) |
1056 | 179 | return NULL; |
1057 | | |
1058 | 104 | return parse_and_drop_buffer(ctx, buf, preserve_white); |
1059 | 283 | } |
1060 | | |
1061 | | fz_xml * |
1062 | | fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white) |
1063 | 634 | { |
1064 | 634 | struct parser parser; |
1065 | 634 | fz_xml *xml = NULL; |
1066 | 634 | fz_xml root, *node; |
1067 | 634 | char *p = NULL; |
1068 | 634 | char *error; |
1069 | 634 | int dofree = 0; |
1070 | 634 | unsigned char *s; |
1071 | 634 | size_t n; |
1072 | 634 | static unsigned char empty_string[] = ""; |
1073 | | |
1074 | 634 | fz_var(dofree); |
1075 | 634 | fz_var(p); |
1076 | | |
1077 | 634 | if (buf == NULL) |
1078 | 0 | { |
1079 | 0 | n = 0; |
1080 | 0 | s = empty_string; |
1081 | 0 | } |
1082 | 634 | else |
1083 | 634 | { |
1084 | | /* ensure we are zero-terminated */ |
1085 | 634 | fz_terminate_buffer(ctx, buf); |
1086 | 634 | n = fz_buffer_storage(ctx, buf, &s); |
1087 | 634 | } |
1088 | | |
1089 | 634 | memset(&root, 0, sizeof(root)); |
1090 | 634 | parser.pool = fz_new_pool(ctx); |
1091 | 634 | parser.head = &root; |
1092 | 634 | parser.preserve_white = preserve_white; |
1093 | 634 | parser.depth = 0; |
1094 | | #ifdef FZ_XML_SEQ |
1095 | | parser.seq = 0; |
1096 | | #endif |
1097 | | |
1098 | 1.26k | fz_try(ctx) |
1099 | 1.26k | { |
1100 | 634 | p = convert_to_utf8(ctx, s, n, &dofree); |
1101 | | |
1102 | 634 | error = xml_parse_document_imp(ctx, &parser, p); |
1103 | 634 | if (error) |
1104 | 77 | fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error); |
1105 | | |
1106 | 2.62k | for (node = parser.head; node; node = node->up) |
1107 | 2.06k | node->u.node.next = NULL; |
1108 | | |
1109 | 557 | xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml); |
1110 | 557 | xml->up = NULL; |
1111 | 557 | xml->down = root.down; |
1112 | 557 | xml->u.doc.refs = 1; |
1113 | 557 | xml->u.doc.pool = parser.pool; |
1114 | | |
1115 | 1.11k | for (node = root.down; node; node = node->u.node.next) |
1116 | 555 | node->up = xml; |
1117 | 557 | } |
1118 | 1.26k | fz_always(ctx) |
1119 | 634 | { |
1120 | 634 | if (dofree) |
1121 | 0 | fz_free(ctx, p); |
1122 | 634 | } |
1123 | 634 | fz_catch(ctx) |
1124 | 77 | { |
1125 | 77 | fz_drop_pool(ctx, parser.pool); |
1126 | 77 | fz_rethrow(ctx); |
1127 | 77 | } |
1128 | | |
1129 | 480 | return xml; |
1130 | 557 | } |
1131 | | |
1132 | | /* |
1133 | | Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax. |
1134 | | |
1135 | | Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp |
1136 | | out of Gumbo on allocation errors. At the end (success or fail) we release the |
1137 | | pool used for Gumbo's parse tree all at once. |
1138 | | */ |
1139 | | |
/* Allocator state handed to Gumbo through GumboOptions.userdata. */
struct mem_gumbo {
	fz_context *ctx; /* context that alloc_gumbo passes to fz_pool_alloc */
	fz_pool *pool; /* pool that alloc_gumbo allocates from */
};
1144 | | |
1145 | | static void *alloc_gumbo(void *ctx, size_t size) |
1146 | 9.54k | { |
1147 | 9.54k | struct mem_gumbo *mem = ctx; |
1148 | 9.54k | return fz_pool_alloc(mem->ctx, mem->pool, size); |
1149 | 9.54k | } |
1150 | | |
/*
	Gumbo deallocator callback: intentionally does nothing. Allocations
	come from a pool that is released all at once.
*/
static void dealloc_gumbo(void *ctx, void *ptr)
{
	(void)ctx;
	(void)ptr;
}
1155 | | |
1156 | | static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node) |
1157 | 1.25k | { |
1158 | 1.25k | unsigned int i; |
1159 | 1.25k | const char *tag, *end, *sentinel; |
1160 | | |
1161 | 1.25k | switch (node->type) |
1162 | 1.25k | { |
1163 | 619 | case GUMBO_NODE_ELEMENT: |
1164 | 619 | if (node->v.element.tag != GUMBO_TAG_UNKNOWN) |
1165 | 619 | { |
1166 | 619 | tag = gumbo_normalized_tagname(node->v.element.tag); |
1167 | 619 | end = tag + strlen(tag); |
1168 | 619 | } |
1169 | 0 | else |
1170 | 0 | { |
1171 | 0 | tag = node->v.element.original_tag.data; |
1172 | 0 | sentinel = tag + node->v.element.original_tag.length; |
1173 | 0 | if (tag[0] == '<') |
1174 | 0 | ++tag; |
1175 | 0 | for (end = tag; end < sentinel; ++end) |
1176 | 0 | if (end[0] == '>' || end[0] == '/' || iswhite(end[0])) |
1177 | 0 | break; |
1178 | 0 | } |
1179 | 619 | xml_emit_open_tag(ctx, parser, tag, end, 0); |
1180 | 905 | for (i = 0; i < node->v.element.attributes.length; ++i) |
1181 | 286 | { |
1182 | 286 | GumboAttribute *att = node->v.element.attributes.data[i]; |
1183 | 286 | xml_emit_att_name(ctx, parser, att->name, att->name+strlen(att->name)); |
1184 | 286 | xml_emit_att_value(ctx, parser, att->value, att->value+strlen(att->value)); |
1185 | 286 | } |
1186 | 1.82k | for (i = 0; i < node->v.element.children.length; ++i) |
1187 | 1.20k | { |
1188 | 1.20k | GumboNode *child = node->v.element.children.data[i]; |
1189 | 1.20k | xml_from_gumbo(ctx, parser, child); |
1190 | 1.20k | } |
1191 | 619 | xml_emit_close_tag(ctx, parser); |
1192 | 619 | break; |
1193 | | |
1194 | 247 | case GUMBO_NODE_TEXT: |
1195 | 247 | case GUMBO_NODE_CDATA: |
1196 | 640 | case GUMBO_NODE_WHITESPACE: |
1197 | 640 | xml_emit_text(ctx, parser, node->v.text.text, node->v.text.text+strlen(node->v.text.text)); |
1198 | 640 | break; |
1199 | | |
1200 | 0 | case GUMBO_NODE_DOCUMENT: |
1201 | 0 | case GUMBO_NODE_COMMENT: |
1202 | 0 | case GUMBO_NODE_TEMPLATE: |
1203 | 0 | break; |
1204 | 1.25k | } |
1205 | 1.25k | } |
1206 | | |
1207 | | fz_xml * |
1208 | | fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf) |
1209 | 53 | { |
1210 | 53 | struct parser parser; |
1211 | 53 | fz_xml *xml = NULL; |
1212 | 53 | fz_xml root, *node; |
1213 | 53 | char *p = NULL; |
1214 | 53 | int dofree = 0; |
1215 | 53 | unsigned char *s; |
1216 | 53 | size_t n; |
1217 | 53 | GumboOutput *soup = NULL; |
1218 | 53 | GumboOptions opts; |
1219 | 53 | struct mem_gumbo mem; |
1220 | 53 | static unsigned char empty_string[] = ""; |
1221 | | |
1222 | 53 | fz_var(mem.pool); |
1223 | 53 | fz_var(soup); |
1224 | 53 | fz_var(dofree); |
1225 | 53 | fz_var(p); |
1226 | | |
1227 | 53 | if (buf == NULL) |
1228 | 0 | { |
1229 | 0 | n = 0; |
1230 | 0 | s = empty_string; |
1231 | 0 | } |
1232 | 53 | else |
1233 | 53 | { |
1234 | | /* ensure we are zero-terminated */ |
1235 | 53 | fz_terminate_buffer(ctx, buf); |
1236 | 53 | n = fz_buffer_storage(ctx, buf, &s); |
1237 | 53 | } |
1238 | | |
1239 | 53 | mem.ctx = ctx; |
1240 | 53 | mem.pool = NULL; |
1241 | | |
1242 | 53 | memset(&root, 0, sizeof(root)); |
1243 | 53 | parser.pool = fz_new_pool(ctx); |
1244 | 53 | parser.head = &root; |
1245 | 53 | parser.preserve_white = 1; |
1246 | 53 | parser.depth = 0; |
1247 | | #ifdef FZ_XML_SEQ |
1248 | | parser.seq = 0; |
1249 | | #endif |
1250 | | |
1251 | 106 | fz_try(ctx) |
1252 | 106 | { |
1253 | 53 | p = convert_to_utf8(ctx, s, n, &dofree); |
1254 | | |
1255 | 53 | mem.pool = fz_new_pool(ctx); |
1256 | 53 | memset(&opts, 0, sizeof opts); |
1257 | 53 | opts.allocator = alloc_gumbo; |
1258 | 53 | opts.deallocator = dealloc_gumbo; |
1259 | 53 | opts.userdata = &mem; |
1260 | 53 | opts.tab_stop = 8; |
1261 | 53 | opts.stop_on_first_error = 0; |
1262 | 53 | opts.max_errors = -1; |
1263 | 53 | opts.fragment_context = GUMBO_TAG_LAST; |
1264 | 53 | opts.fragment_namespace = GUMBO_NAMESPACE_HTML; |
1265 | | |
1266 | 53 | soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p)); |
1267 | | |
1268 | 53 | xml_from_gumbo(ctx, &parser, soup->root); |
1269 | | |
1270 | 106 | for (node = parser.head; node; node = node->up) |
1271 | 53 | node->u.node.next = NULL; |
1272 | | |
1273 | 53 | xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml); |
1274 | 53 | xml->up = NULL; |
1275 | 53 | xml->down = root.down; |
1276 | 53 | xml->u.doc.pool = parser.pool; |
1277 | 53 | xml->u.doc.refs = 1; |
1278 | | |
1279 | 106 | for (node = root.down; node; node = node->u.node.next) |
1280 | 53 | node->up = xml; |
1281 | 53 | } |
1282 | 106 | fz_always(ctx) |
1283 | 53 | { |
1284 | 53 | if (soup) |
1285 | 53 | gumbo_destroy_output(&opts, soup); |
1286 | 53 | fz_drop_pool(ctx, mem.pool); |
1287 | 53 | if (dofree) |
1288 | 0 | fz_free(ctx, p); |
1289 | 53 | } |
1290 | 53 | fz_catch(ctx) |
1291 | 0 | { |
1292 | 0 | fz_drop_pool(ctx, parser.pool); |
1293 | 0 | fz_rethrow(ctx); |
1294 | 0 | } |
1295 | | |
1296 | 53 | return xml; |
1297 | 53 | } |
1298 | | |
1299 | | fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match) |
1300 | 232 | { |
1301 | 232 | return fz_xml_find_dfs_top(item, tag, att, match, NULL); |
1302 | 232 | } |
1303 | | |
1304 | | fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top) |
1305 | 248 | { |
1306 | | /* Skip over any DOC object. */ |
1307 | 248 | if (item && FZ_DOCUMENT_ITEM(item)) |
1308 | 232 | item = item->down; |
1309 | | |
1310 | 33.8k | while (item) |
1311 | 33.8k | { |
1312 | 33.8k | if (!FZ_TEXT_ITEM(item) && (tag == NULL || !strcmp(item->u.node.u.d.name, tag))) |
1313 | 455 | { |
1314 | 455 | if (att == NULL || (match == NULL ? fz_xml_att(item, att) != NULL : fz_xml_att_eq(item, att, match))) |
1315 | 137 | return item; |
1316 | 455 | } |
1317 | | |
1318 | 33.6k | if (!FZ_TEXT_ITEM(item) && item->down) |
1319 | 15.0k | item = item->down; |
1320 | 18.6k | else if (item->u.node.next) |
1321 | 9.55k | item = item->u.node.next; |
1322 | 9.09k | else |
1323 | 14.9k | while (1) { |
1324 | 14.9k | item = item->up; |
1325 | | /* Stop searching if we hit our declared 'top' item. */ |
1326 | 14.9k | if (item == top) |
1327 | 0 | return NULL; |
1328 | | /* We should never reach item == NULL, but just in case. */ |
1329 | 14.9k | if (item == NULL) |
1330 | 0 | return NULL; |
1331 | | /* If we reach the DOC object at the top, we're done. */ |
1332 | 14.9k | if (item->up == NULL) |
1333 | 109 | return NULL; |
1334 | 14.8k | if (item->u.node.next) |
1335 | 8.99k | { |
1336 | 8.99k | item = item->u.node.next; |
1337 | 8.99k | break; |
1338 | 8.99k | } |
1339 | 14.8k | } |
1340 | 33.6k | } |
1341 | | |
1342 | 2 | return NULL; |
1343 | 248 | } |
1344 | | |
1345 | | fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match) |
1346 | 24 | { |
1347 | 24 | return fz_xml_find_next_dfs_top(item, tag, att, match, NULL); |
1348 | 24 | } |
1349 | | |
1350 | | fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top) |
1351 | 24 | { |
1352 | | /* Skip over any DOC object. */ |
1353 | 24 | if (item && FZ_DOCUMENT_ITEM(item)) |
1354 | 0 | item = item->down; |
1355 | | |
1356 | 24 | if (item == NULL) |
1357 | 0 | return NULL; |
1358 | | |
1359 | 24 | if (item->down) |
1360 | 4 | item = item->down; |
1361 | 20 | else if (item->u.node.next) |
1362 | 12 | item = item->u.node.next; |
1363 | 8 | else |
1364 | 16 | while (1) { |
1365 | 16 | item = item->up; |
1366 | | /* Stop searching if we hit our declared 'top' item. */ |
1367 | 16 | if (item == top) |
1368 | 0 | return NULL; |
1369 | | /* We should never reach item == NULL, but just in case. */ |
1370 | 16 | if (item == NULL) |
1371 | 0 | return NULL; |
1372 | | /* If we reach the DOC object at the top, we're done. */ |
1373 | 16 | if (item->up == NULL) |
1374 | 8 | return NULL; |
1375 | 8 | if (item->u.node.next) |
1376 | 0 | { |
1377 | 0 | item = item->u.node.next; |
1378 | 0 | break; |
1379 | 0 | } |
1380 | 8 | } |
1381 | | |
1382 | 16 | return fz_xml_find_dfs_top(item, tag, att, match, top); |
1383 | 24 | } |
1384 | | |
1385 | | fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml) |
1386 | 0 | { |
1387 | 0 | fz_xml *dom = xml; |
1388 | 0 | if (xml == NULL) |
1389 | 0 | return xml; |
1390 | | |
1391 | 0 | while (dom->up) |
1392 | 0 | dom = dom->up; |
1393 | |
|
1394 | 0 | fz_keep_imp(ctx, dom, &dom->u.doc.refs); |
1395 | | |
1396 | | /* Return the original node pointer, not the dom pointer! */ |
1397 | 0 | return xml; |
1398 | 0 | } |