Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * HTMLtree.c : implementation of access function for an HTML tree. |
3 | | * |
4 | | * See Copyright for the status of this software. |
5 | | * |
6 | | * Author: Daniel Veillard |
7 | | */ |
8 | | |
9 | | |
10 | | #define IN_LIBXML |
11 | | #include "libxml.h" |
12 | | #ifdef LIBXML_HTML_ENABLED |
13 | | |
14 | | #include <string.h> /* for memset() only ! */ |
15 | | #include <ctype.h> |
16 | | #include <stdlib.h> |
17 | | |
18 | | #include <libxml/xmlmemory.h> |
19 | | #include <libxml/HTMLparser.h> |
20 | | #include <libxml/HTMLtree.h> |
21 | | #include <libxml/entities.h> |
22 | | #include <libxml/xmlerror.h> |
23 | | #include <libxml/parserInternals.h> |
24 | | #include <libxml/uri.h> |
25 | | |
26 | | #include "private/buf.h" |
27 | | #include "private/html.h" |
28 | | #include "private/error.h" |
29 | | #include "private/html.h" |
30 | | #include "private/io.h" |
31 | | #include "private/save.h" |
32 | | #include "private/tree.h" |
33 | | |
34 | | /************************************************************************ |
35 | | * * |
36 | | * Getting/Setting encoding meta tags * |
37 | | * * |
38 | | ************************************************************************/ |
39 | | |
40 | | typedef struct { |
41 | | xmlAttrPtr attr; /* charset or content */ |
42 | | const xmlChar *attrValue; |
43 | | htmlMetaEncodingOffsets off; |
44 | | } htmlMetaEncoding; |
45 | | |
46 | | static htmlNodePtr |
47 | 6.54k | htmlFindFirstChild(htmlNodePtr parent, const char *name) { |
48 | 6.54k | htmlNodePtr child; |
49 | | |
50 | 14.8k | for (child = parent->children; child != NULL; child = child->next) { |
51 | 13.4k | if ((child->type == XML_ELEMENT_NODE) && |
52 | 13.4k | (xmlStrcasecmp(child->name, BAD_CAST name) == 0)) |
53 | 5.07k | return(child); |
54 | 13.4k | } |
55 | | |
56 | 1.47k | return(NULL); |
57 | 6.54k | } |
58 | | |
59 | | static htmlNodePtr |
60 | 4.24k | htmlFindHead(htmlDocPtr doc) { |
61 | 4.24k | htmlNodePtr html; |
62 | | |
63 | 4.24k | if (doc == NULL) |
64 | 522 | return(NULL); |
65 | | |
66 | 3.71k | html = htmlFindFirstChild((htmlNodePtr) doc, "html"); |
67 | 3.71k | if (html == NULL) |
68 | 887 | return(NULL); |
69 | | |
70 | 2.83k | return(htmlFindFirstChild(html, "head")); |
71 | 3.71k | } |
72 | | |
73 | | int |
74 | 4.71k | htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) { |
75 | 4.71k | const xmlChar *p = val; |
76 | | |
77 | 5.74k | while (1) { |
78 | 5.74k | size_t start, end; |
79 | | |
80 | 47.9k | while ((*p != 'c') && (*p != 'C')) { |
81 | 42.9k | if (*p == 0) |
82 | 740 | return(0); |
83 | 42.1k | p += 1; |
84 | 42.1k | } |
85 | 5.00k | p += 1; |
86 | | |
87 | 5.00k | if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0) |
88 | 514 | continue; |
89 | | |
90 | 4.49k | p += 6; |
91 | 4.98k | while (IS_WS_HTML(*p)) p += 1; |
92 | | |
93 | 4.49k | if (*p != '=') |
94 | 523 | continue; |
95 | | |
96 | 3.97k | p += 1; |
97 | 6.64k | while (IS_WS_HTML(*p)) p += 1; |
98 | | |
99 | 3.97k | if (*p == 0) |
100 | 637 | return(0); |
101 | | |
102 | 3.33k | if ((*p == '"') || (*p == '\'')) { |
103 | 966 | int quote = *p; |
104 | | |
105 | 966 | p += 1; |
106 | 1.40k | while (IS_WS_HTML(*p)) p += 1; |
107 | | |
108 | 966 | start = p - val; |
109 | 966 | end = start; |
110 | | |
111 | 4.11k | while (*p != quote) { |
112 | 3.66k | if (*p == 0) |
113 | 516 | return(0); |
114 | 3.14k | if (!IS_WS_HTML(*p)) |
115 | 2.56k | end = p + 1 - val; |
116 | 3.14k | p += 1; |
117 | 3.14k | } |
118 | 2.36k | } else { |
119 | 2.36k | start = p - val; |
120 | | |
121 | 6.00M | while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p))) |
122 | 6.00M | p += 1; |
123 | | |
124 | 2.36k | end = p - val; |
125 | 2.36k | } |
126 | | |
127 | 2.81k | off->start = start; |
128 | 2.81k | off->end = end; |
129 | 2.81k | off->size = p - val + strlen((char *) p); |
130 | | |
131 | 2.81k | return(1); |
132 | 3.33k | } |
133 | | |
134 | 0 | return(0); |
135 | 4.71k | } |
136 | | |
137 | | static xmlAttrPtr |
138 | 2.01k | htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) { |
139 | 2.01k | xmlAttrPtr attr, contentAttr = NULL; |
140 | 2.01k | int isContentType = 0; |
141 | | |
142 | 2.01k | if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0) |
143 | 0 | return(NULL); |
144 | | |
145 | 5.36k | for (attr = elem->properties; attr != NULL; attr = attr->next) { |
146 | 3.35k | if (attr->ns != NULL) |
147 | 238 | continue; |
148 | 3.11k | if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) { |
149 | 0 | *outIsContentType = 0; |
150 | 0 | return(attr); |
151 | 0 | } |
152 | 3.11k | if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0) |
153 | 1.49k | contentAttr = attr; |
154 | 3.11k | if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) && |
155 | 3.11k | (attr->children != NULL) && |
156 | 3.11k | (attr->children->type == XML_TEXT_NODE) && |
157 | 3.11k | (attr->children->next == NULL) && |
158 | 3.11k | (xmlStrcasecmp(attr->children->content, |
159 | 1.49k | BAD_CAST "Content-Type") == 0)) |
160 | 1.49k | isContentType = 1; |
161 | 3.11k | } |
162 | | |
163 | 2.01k | if ((isContentType) && (contentAttr != NULL)) { |
164 | 1.49k | *outIsContentType = 1; |
165 | 1.49k | return(contentAttr); |
166 | 1.49k | } |
167 | | |
168 | 514 | return(NULL); |
169 | 2.01k | } |
170 | | |
171 | | static int |
172 | 3.14k | htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) { |
173 | 3.14k | xmlAttrPtr attr; |
174 | 3.14k | const xmlChar *val = NULL; |
175 | 3.14k | int isContentType; |
176 | | |
177 | 3.14k | if ((elem->type != XML_ELEMENT_NODE) || |
178 | 3.14k | (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)) |
179 | 1.13k | return(0); |
180 | | |
181 | 2.01k | attr = htmlFindMetaEncodingAttr(elem, &isContentType); |
182 | 2.01k | if (attr == NULL) |
183 | 514 | return(0); |
184 | | |
185 | 1.49k | if ((attr->children != NULL) && |
186 | 1.49k | (attr->children->type == XML_TEXT_NODE) && |
187 | 1.49k | (attr->children->next == NULL) && |
188 | 1.49k | (attr->children->content != NULL)) |
189 | 1.49k | val = attr->children->content; |
190 | 0 | else |
191 | 0 | val = BAD_CAST ""; |
192 | | |
193 | | |
194 | 1.49k | if (!isContentType) { |
195 | 0 | size_t size = strlen((char *) val); |
196 | 0 | size_t start = 0; |
197 | 0 | size_t end = size; |
198 | |
|
199 | 0 | while ((start < size) && (IS_WS_HTML(val[start]))) |
200 | 0 | start += 1; |
201 | |
|
202 | 0 | while ((end > 0) && (IS_WS_HTML(val[end-1]))) |
203 | 0 | end -= 1; |
204 | |
|
205 | 0 | menc->attr = attr; |
206 | 0 | menc->attrValue = val; |
207 | 0 | menc->off.start = start; |
208 | 0 | menc->off.end = end; |
209 | 0 | menc->off.size = size; |
210 | |
|
211 | 0 | return(1); |
212 | 1.49k | } else { |
213 | 1.49k | if (htmlParseContentType(val, &menc->off)) { |
214 | 1.07k | menc->attr = attr; |
215 | 1.07k | menc->attrValue = val; |
216 | | |
217 | 1.07k | return(1); |
218 | 1.07k | } |
219 | 1.49k | } |
220 | | |
221 | 419 | return(0); |
222 | 1.49k | } |
223 | | |
224 | | static xmlChar * |
225 | 0 | htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) { |
226 | 0 | xmlChar *newVal, *p; |
227 | 0 | size_t size, oldEncSize, newEncSize; |
228 | | |
229 | | /* |
230 | | * The pseudo "HTML" encoding only produces ASCII. |
231 | | */ |
232 | 0 | if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0) |
233 | 0 | encoding = "ASCII"; |
234 | |
|
235 | 0 | oldEncSize = menc->off.end - menc->off.start; |
236 | 0 | newEncSize = strlen((char *) encoding); |
237 | 0 | size = menc->off.size - oldEncSize + newEncSize; |
238 | 0 | newVal = xmlMalloc(size + 1); |
239 | 0 | if (newVal == NULL) |
240 | 0 | return(NULL); |
241 | | |
242 | 0 | p = newVal; |
243 | 0 | memcpy(p, menc->attrValue, menc->off.start); |
244 | 0 | p += menc->off.start; |
245 | 0 | memcpy(p, encoding, newEncSize); |
246 | 0 | p += newEncSize; |
247 | 0 | memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end); |
248 | 0 | newVal[size] = 0; |
249 | |
|
250 | 0 | return(newVal); |
251 | 0 | } |
252 | | |
253 | | /** |
254 | | * Look up an encoding declaration in the meta tags. |
255 | | * |
256 | | * The returned string points into attribute content and can contain |
257 | | * trailing garbage. It should be copied before modifying or freeing |
258 | | * nodes. |
259 | | * |
260 | | * @param doc the document |
261 | | * @returns the encoding or NULL if not found. |
262 | | */ |
263 | | const xmlChar * |
264 | 4.24k | htmlGetMetaEncoding(xmlDoc *doc) { |
265 | 4.24k | htmlNodePtr head, node; |
266 | | |
267 | 4.24k | head = htmlFindHead(doc); |
268 | 4.24k | if (head == NULL) |
269 | 1.99k | return(NULL); |
270 | | |
271 | 4.31k | for (node = head->children; node != NULL; node = node->next) { |
272 | 3.14k | htmlMetaEncoding menc; |
273 | | |
274 | 3.14k | if (htmlParseMetaEncoding(node, &menc)) { |
275 | | /* |
276 | | * Returning a `const xmlChar *` only allows to return |
277 | | * a suffix. In http-equiv meta tags, there could be |
278 | | * more data after the charset, although it's probably |
279 | | * rare in practice. |
280 | | */ |
281 | 1.07k | return(menc.attrValue + menc.off.start); |
282 | 1.07k | } |
283 | 3.14k | } |
284 | | |
285 | 1.16k | return(NULL); |
286 | 2.24k | } |
287 | | |
288 | | /** |
289 | | * Creates or updates a meta tag with an encoding declaration. |
290 | | * |
291 | | * NOTE: This will not change the document content encoding. |
292 | | * |
293 | | * @param doc the document |
294 | | * @param encoding the encoding string |
295 | | * @returns 0 in case of success, 1 if no head element was found or |
296 | | * arguments are invalid and -1 if memory allocation failed. |
297 | | */ |
298 | | int |
299 | 0 | htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) { |
300 | 0 | htmlNodePtr head, meta; |
301 | 0 | int found = 0; |
302 | |
|
303 | 0 | if (encoding == NULL) |
304 | 0 | return(1); |
305 | | |
306 | 0 | head = htmlFindHead(doc); |
307 | 0 | if (head == NULL) |
308 | 0 | return(1); |
309 | | |
310 | 0 | for (meta = head->children; meta != NULL; meta = meta->next) { |
311 | 0 | htmlMetaEncoding menc; |
312 | |
|
313 | 0 | if (htmlParseMetaEncoding(meta, &menc)) { |
314 | 0 | xmlChar *newVal; |
315 | 0 | int ret; |
316 | |
|
317 | 0 | found = 1; |
318 | |
|
319 | 0 | newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding); |
320 | 0 | if (newVal == NULL) |
321 | 0 | return(-1); |
322 | 0 | xmlNodeSetContent((xmlNodePtr) menc.attr, NULL); |
323 | 0 | ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal); |
324 | 0 | xmlFree(newVal); |
325 | |
|
326 | 0 | if (ret < 0) |
327 | 0 | return(-1); |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | 0 | if (found) |
332 | 0 | return(0); |
333 | | |
334 | 0 | meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL); |
335 | 0 | if (meta == NULL) |
336 | 0 | return(-1); |
337 | | |
338 | 0 | if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) { |
339 | 0 | xmlFreeNode(meta); |
340 | 0 | return(-1); |
341 | 0 | } |
342 | | |
343 | 0 | if (head->children == NULL) |
344 | 0 | xmlAddChild(head, meta); |
345 | 0 | else |
346 | 0 | xmlAddPrevSibling(head->children, meta); |
347 | |
|
348 | 0 | return(0); |
349 | 0 | } |
350 | | |
351 | | /** |
352 | | * Determine if a given attribute is a boolean attribute. This |
353 | | * doesn't handle HTML5. |
354 | | * |
355 | | * @deprecated Internal function, don't use. |
356 | | * |
357 | | * @param name the name of the attribute to check |
358 | | * @returns false if the attribute is not boolean, true otherwise. |
359 | | */ |
360 | | int |
361 | | htmlIsBooleanAttr(const xmlChar *name) |
362 | 589k | { |
363 | 589k | const char *str = NULL; |
364 | | |
365 | 589k | if (name == NULL) |
366 | 253 | return(0); |
367 | | |
368 | | /* |
369 | | * These are the HTML attributes which will be output |
370 | | * in minimized form, i.e. `<option selected="selected">` will be |
371 | | * output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output |
372 | | * Method": |
373 | | * |
374 | | * "checked", "compact", "declare", "defer", "disabled", "ismap", |
375 | | * "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", |
376 | | * "selected" |
377 | | * |
378 | | * Additional attributes from HTML5 (not implemented yet): |
379 | | * |
380 | | * "allowfullscreen", "alpha", "async", "autofocus", "autoplay", |
381 | | * "controls", "default", "formnovalidate", "inert", "itemscope", |
382 | | * "loop", "muted", "nomodule", "novalidate", "open", "playsinline", |
383 | | * "required", "reversed", "shadowrootdelegatesfocus", |
384 | | * "shadowrootclonable", "shadowrootserializable", |
385 | | * "shadowrootcustomelementregistry", "truespeed" |
386 | | */ |
387 | | |
388 | 589k | switch (name[0] | 0x20) { |
389 | 37.7k | case 'c': |
390 | 37.7k | name += 1; |
391 | 37.7k | switch (name[0] | 0x20) { |
392 | 5.55k | case 'h': str = "ecked"; break; |
393 | 8.90k | case 'o': str = "mpact"; break; |
394 | 37.7k | } |
395 | 37.7k | break; |
396 | 37.7k | case 'd': |
397 | 23.1k | name += 1; |
398 | 23.1k | switch (name[0] | 0x20) { |
399 | 5.20k | case 'e': |
400 | 5.20k | name += 1; |
401 | 5.20k | switch (name[0] | 0x20) { |
402 | 841 | case 'c': str = "lare"; break; |
403 | 632 | case 'f': str = "er"; break; |
404 | 5.20k | } |
405 | 5.20k | break; |
406 | 5.20k | case 'i': str = "sabled"; break; |
407 | 23.1k | } |
408 | 23.1k | break; |
409 | 80.3k | case 'i': |
410 | 80.3k | str = "smap"; |
411 | 80.3k | break; |
412 | 9.55k | case 'm': |
413 | 9.55k | str = "ultiple"; |
414 | 9.55k | break; |
415 | 9.35k | case 'n': |
416 | 9.35k | name += 1; |
417 | 9.35k | if ((name[0] | 0x20) != 'o') |
418 | 5.46k | break; |
419 | 3.89k | name += 1; |
420 | 3.89k | switch (name[0] | 0x20) { |
421 | 732 | case 'h': str = "ref"; break; |
422 | 687 | case 'r': str = "esize"; break; |
423 | 839 | case 's': str = "hade"; break; |
424 | 536 | case 'w': str = "rap"; break; |
425 | 3.89k | } |
426 | 3.89k | break; |
427 | 5.96k | case 'r': |
428 | 5.96k | str = "eadonly"; |
429 | 5.96k | break; |
430 | 24.4k | case 's': |
431 | 24.4k | str = "elected"; |
432 | 24.4k | break; |
433 | 589k | } |
434 | | |
435 | 589k | if (str == NULL) |
436 | 447k | return(0); |
437 | | |
438 | 141k | return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0); |
439 | 589k | } |
440 | | |
441 | | #ifdef LIBXML_OUTPUT_ENABLED |
442 | | /************************************************************************ |
443 | | * * |
444 | | * Dumping HTML tree content to a simple buffer * |
445 | | * * |
446 | | ************************************************************************/ |
447 | | |
448 | | static xmlParserErrors |
449 | 8.41k | htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) { |
450 | | /* |
451 | | * Fallback to HTML if the encoding is unspecified |
452 | | */ |
453 | 8.41k | if (encoding == NULL) |
454 | 7.53k | encoding = "HTML"; |
455 | | |
456 | 8.41k | return(xmlOpenCharEncodingHandler(encoding, /* output */ 1, out)); |
457 | 8.41k | } |
458 | | |
459 | | /** |
460 | | * Serialize an HTML node to an xmlBuf. |
461 | | * |
462 | | * @param buf the xmlBuf output |
463 | | * @param doc the document (unused) |
464 | | * @param cur the current node |
465 | | * @param format should formatting newlines be added |
466 | | * @returns the number of bytes written or -1 in case of error |
467 | | */ |
468 | | static size_t |
469 | | htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED, |
470 | 5.16k | xmlNodePtr cur, int format) { |
471 | 5.16k | size_t use; |
472 | 5.16k | size_t ret; |
473 | 5.16k | xmlOutputBufferPtr outbuf; |
474 | | |
475 | 5.16k | if (cur == NULL) { |
476 | 0 | return ((size_t) -1); |
477 | 0 | } |
478 | 5.16k | if (buf == NULL) { |
479 | 0 | return ((size_t) -1); |
480 | 0 | } |
481 | 5.16k | outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); |
482 | 5.16k | if (outbuf == NULL) |
483 | 1 | return ((size_t) -1); |
484 | 5.16k | memset(outbuf, 0, sizeof(xmlOutputBuffer)); |
485 | 5.16k | outbuf->buffer = buf; |
486 | 5.16k | outbuf->encoder = NULL; |
487 | 5.16k | outbuf->writecallback = NULL; |
488 | 5.16k | outbuf->closecallback = NULL; |
489 | 5.16k | outbuf->context = NULL; |
490 | 5.16k | outbuf->written = 0; |
491 | | |
492 | 5.16k | use = xmlBufUse(buf); |
493 | 5.16k | htmlNodeDumpInternal(outbuf, cur, NULL, format); |
494 | 5.16k | if (outbuf->error) |
495 | 3 | ret = (size_t) -1; |
496 | 5.16k | else |
497 | 5.16k | ret = xmlBufUse(buf) - use; |
498 | 5.16k | xmlFree(outbuf); |
499 | 5.16k | return (ret); |
500 | 5.16k | } |
501 | | |
502 | | /** |
503 | | * Serialize an HTML node to an xmlBuffer. Always uses UTF-8. |
504 | | * |
505 | | * @param buf the HTML buffer output |
506 | | * @param doc the document |
507 | | * @param cur the current node |
508 | | * @returns the number of bytes written or -1 in case of error |
509 | | */ |
510 | | int |
511 | 6.55k | htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) { |
512 | 6.55k | xmlBufPtr buffer; |
513 | 6.55k | size_t ret1; |
514 | 6.55k | int ret2; |
515 | | |
516 | 6.55k | if ((buf == NULL) || (cur == NULL)) |
517 | 1.37k | return(-1); |
518 | | |
519 | 5.17k | xmlInitParser(); |
520 | 5.17k | buffer = xmlBufFromBuffer(buf); |
521 | 5.17k | if (buffer == NULL) |
522 | 3 | return(-1); |
523 | | |
524 | 5.16k | ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1); |
525 | | |
526 | 5.16k | ret2 = xmlBufBackToBuffer(buffer, buf); |
527 | | |
528 | 5.16k | if ((ret1 == (size_t) -1) || (ret2 < 0)) |
529 | 4 | return(-1); |
530 | 5.16k | return(ret1 > INT_MAX ? INT_MAX : ret1); |
531 | 5.16k | } |
532 | | |
533 | | /** |
534 | | * Serialize an HTML node to a FILE. |
535 | | * |
536 | | * If encoding is NULL, ASCII with HTML 4.0 named character entities |
537 | | * will be used. This is inefficient compared to UTF-8 and might be |
538 | | * changed in a future version. |
539 | | * |
540 | | * @param out the FILE pointer |
541 | | * @param doc the document (unused) |
542 | | * @param cur the current node |
543 | | * @param encoding the document encoding (optional) |
544 | | * @param format should formatting newlines be added |
545 | | * @returns the number of bytes written or -1 in case of failure. |
546 | | */ |
547 | | int |
548 | | htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED, |
549 | 0 | xmlNode *cur, const char *encoding, int format) { |
550 | 0 | xmlOutputBufferPtr buf; |
551 | 0 | xmlCharEncodingHandlerPtr handler; |
552 | 0 | int ret; |
553 | |
|
554 | 0 | xmlInitParser(); |
555 | | |
556 | | /* |
557 | | * Write the content through an output buffer wrapping the FILE. |
558 | | */ |
559 | 0 | if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) |
560 | 0 | return(-1); |
561 | 0 | buf = xmlOutputBufferCreateFile(out, handler); |
562 | 0 | if (buf == NULL) { |
563 | 0 | xmlCharEncCloseFunc(handler); |
564 | 0 | return(-1); |
565 | 0 | } |
566 | | |
567 | 0 | htmlNodeDumpInternal(buf, cur, NULL, format); |
568 | |
|
569 | 0 | ret = xmlOutputBufferClose(buf); |
570 | 0 | return(ret); |
571 | 0 | } |
572 | | |
573 | | /** |
574 | | * Same as #htmlNodeDumpFileFormat with `format` set to 1 which is |
575 | | * typically undesired. Use of this function is DISCOURAGED in favor |
576 | | * of #htmlNodeDumpFileFormat. |
577 | | * |
578 | | * @param out the FILE pointer |
579 | | * @param doc the document |
580 | | * @param cur the current node |
581 | | */ |
582 | | void |
583 | 0 | htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) { |
584 | 0 | htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); |
585 | 0 | } |
586 | | |
587 | | /** |
588 | | * Serialize an HTML document to memory, also returning the size of |
589 | | * the result. It's up to the caller to free the memory. |
590 | | * |
591 | | * Uses the encoding of the document. If the document has no |
592 | | * encoding, ASCII with HTML 4.0 named character entities will |
593 | | * be used. This is inefficient compared to UTF-8 and might be |
594 | | * changed in a future version. |
595 | | * |
596 | | * @param cur the document |
597 | | * @param mem OUT: the memory pointer |
598 | | * @param size OUT: the memory length |
599 | | * @param format should formatting newlines be added |
600 | | */ |
601 | | void |
602 | 9.08k | htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) { |
603 | 9.08k | xmlOutputBufferPtr buf; |
604 | 9.08k | xmlCharEncodingHandlerPtr handler = NULL; |
605 | | |
606 | 9.08k | xmlInitParser(); |
607 | | |
608 | 9.08k | if ((mem == NULL) || (size == NULL)) |
609 | 0 | return; |
610 | 9.08k | *mem = NULL; |
611 | 9.08k | *size = 0; |
612 | 9.08k | if (cur == NULL) |
613 | 663 | return; |
614 | | |
615 | 8.41k | if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) |
616 | 3 | return; |
617 | 8.41k | buf = xmlAllocOutputBuffer(handler); |
618 | 8.41k | if (buf == NULL) { |
619 | 3 | xmlCharEncCloseFunc(handler); |
620 | 3 | return; |
621 | 3 | } |
622 | | |
623 | 8.41k | htmlDocContentDumpFormatOutput(buf, cur, NULL, format); |
624 | | |
625 | 8.41k | xmlOutputBufferFlush(buf); |
626 | | |
627 | 8.41k | if (!buf->error) { |
628 | 8.04k | if (buf->conv != NULL) { |
629 | 7.83k | *size = xmlBufUse(buf->conv); |
630 | 7.83k | *mem = xmlStrndup(xmlBufContent(buf->conv), *size); |
631 | 7.83k | } else { |
632 | 207 | *size = xmlBufUse(buf->buffer); |
633 | 207 | *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); |
634 | 207 | } |
635 | 8.04k | } |
636 | | |
637 | 8.41k | xmlOutputBufferClose(buf); |
638 | 8.41k | } |
639 | | |
640 | | /** |
641 | | * Same as #htmlDocDumpMemoryFormat with `format` set to 1 which |
642 | | * is typically undesired. Also see the warnings there. Use of |
643 | | * this function is DISCOURAGED in favor of |
644 | | * #htmlDocContentDumpFormatOutput. |
645 | | * |
646 | | * @param cur the document |
647 | | * @param mem OUT: the memory pointer |
648 | | * @param size OUT: the memory length |
649 | | */ |
650 | | void |
651 | 4.81k | htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) { |
652 | 4.81k | htmlDocDumpMemoryFormat(cur, mem, size, 1); |
653 | 4.81k | } |
654 | | |
655 | | |
656 | | /************************************************************************ |
657 | | * * |
658 | | * Dumping HTML tree content to an I/O output buffer * |
659 | | * * |
660 | | ************************************************************************/ |
661 | | |
662 | | /** |
663 | | * Serialize the HTML document's DTD, if any. |
664 | | * |
665 | | * Ignores `encoding` and uses the encoding of the output buffer. |
666 | | * |
667 | | * @param buf the HTML buffer output |
668 | | * @param doc the document |
669 | | * @param encoding the encoding string (unused) |
670 | | */ |
671 | | static void |
672 | | htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
673 | 19.4k | const char *encoding ATTRIBUTE_UNUSED) { |
674 | 19.4k | xmlDtdPtr cur = doc->intSubset; |
675 | | |
676 | 19.4k | if (cur == NULL) |
677 | 0 | return; |
678 | 19.4k | xmlOutputBufferWrite(buf, 10, "<!DOCTYPE "); |
679 | 19.4k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
680 | 19.4k | if (cur->ExternalID != NULL) { |
681 | 14.2k | xmlOutputBufferWrite(buf, 8, " PUBLIC "); |
682 | 14.2k | xmlOutputBufferWriteQuotedString(buf, cur->ExternalID); |
683 | 14.2k | if (cur->SystemID != NULL) { |
684 | 13.8k | xmlOutputBufferWrite(buf, 1, " "); |
685 | 13.8k | xmlOutputBufferWriteQuotedString(buf, cur->SystemID); |
686 | 13.8k | } |
687 | 14.2k | } else if (cur->SystemID != NULL && |
688 | 5.16k | xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { |
689 | 4.23k | xmlOutputBufferWrite(buf, 8, " SYSTEM "); |
690 | 4.23k | xmlOutputBufferWriteQuotedString(buf, cur->SystemID); |
691 | 4.23k | } |
692 | 19.4k | xmlOutputBufferWrite(buf, 2, ">\n"); |
693 | 19.4k | } |
694 | | |
695 | | static void |
696 | 3.60k | htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) { |
697 | 3.60k | const xmlChar *tmp = content; |
698 | | |
699 | | /* |
700 | | * See appendix "B.2.1 Non-ASCII characters in URI attribute |
701 | | * values" in the HTML 4.01 spec. This is also recommended |
702 | | * by the HTML output method of the XSLT 1.0 spec. |
703 | | * |
704 | | * We also escape space and control chars. |
705 | | */ |
706 | | |
707 | | /* Skip over initial whitespace */ |
708 | 5.02k | while (IS_WS_HTML(*tmp)) tmp++; |
709 | 3.60k | if (tmp > content) { |
710 | 532 | xmlOutputBufferWrite(buf, tmp - content, (char *) content); |
711 | 532 | content = tmp; |
712 | 532 | } |
713 | | |
714 | 16.7M | while (1) { |
715 | 16.7M | char escbuf[3]; |
716 | 16.7M | const char *repl; |
717 | 16.7M | int replSize; |
718 | 16.7M | int c = *tmp; |
719 | | |
720 | 16.8M | while ((c > 0x20) && (c < 0x7F) && (c != '"') && (c != '&')) { |
721 | 85.7k | tmp += 1; |
722 | 85.7k | c = *tmp; |
723 | 85.7k | } |
724 | | |
725 | 16.7M | if (tmp > content) |
726 | 10.4k | xmlOutputBufferWrite(buf, tmp - content, (char *) content); |
727 | | |
728 | 16.7M | if ((c <= 0x20) || (c >= 0x7F)) { |
729 | 16.7M | static const char hex[16] = { |
730 | 16.7M | '0', '1', '2', '3', '4', '5', '6', '7', |
731 | 16.7M | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' |
732 | 16.7M | }; |
733 | | |
734 | 16.7M | if (c == 0) |
735 | 3.60k | break; |
736 | | |
737 | 16.7M | escbuf[0] = '%'; |
738 | 16.7M | escbuf[1] = hex[(c >> 4) & 0x0F]; |
739 | 16.7M | escbuf[2] = hex[c & 0x0F]; |
740 | 16.7M | repl = escbuf; |
741 | 16.7M | replSize = 3; |
742 | 16.7M | } else if (c == '"') { |
743 | 961 | repl = "&quot;"; |
744 | 961 | replSize = 6; |
745 | 1.35k | } else { |
746 | 1.35k | repl = "&amp;"; |
747 | 1.35k | replSize = 5; |
748 | 1.35k | } |
749 | | |
750 | 16.7M | xmlOutputBufferWrite(buf, replSize, repl); |
751 | 16.7M | tmp += 1; |
752 | 16.7M | content = tmp; |
753 | 16.7M | } |
754 | 3.60k | } |
755 | | |
756 | | /** |
757 | | * Serialize an HTML attribute. |
758 | | * |
759 | | * @param buf the HTML buffer output |
760 | | * @param cur the attribute pointer |
761 | | */ |
762 | | static void |
763 | 308k | htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) { |
764 | 308k | xmlOutputBufferWrite(buf, 1, " "); |
765 | | |
766 | 308k | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
767 | 10.2k | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
768 | 10.2k | xmlOutputBufferWrite(buf, 1, ":"); |
769 | 10.2k | } |
770 | 308k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
771 | | |
772 | | /* |
773 | | * The HTML5 spec requires to always serialize empty attribute |
774 | | * values as `=""`. We should probably align with HTML5 at some |
775 | | * point. |
776 | | */ |
777 | 308k | if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { |
778 | 98.3k | xmlNodePtr child; |
779 | 98.3k | int isUri; |
780 | | |
781 | 98.3k | xmlOutputBufferWrite(buf, 2, "=\""); |
782 | | |
783 | | /* |
784 | | * Special handling of URIs doesn't conform to HTML5 and |
785 | | * should probably be removed at some point. |
786 | | */ |
787 | 98.3k | isUri = (cur->ns == NULL) && (cur->parent != NULL) && |
788 | 98.3k | (cur->parent->ns == NULL) && |
789 | 98.3k | ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || |
790 | 87.1k | (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || |
791 | 87.1k | (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || |
792 | 87.1k | ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && |
793 | 83.9k | (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a")))); |
794 | | |
795 | 199k | for (child = cur->children; child != NULL; child = child->next) { |
796 | 101k | if (child->type == XML_TEXT_NODE) { |
797 | 99.6k | const xmlChar *content = child->content; |
798 | | |
799 | 99.6k | if (content == NULL) |
800 | 437 | continue; |
801 | | |
802 | 99.2k | if (isUri) { |
803 | 3.60k | htmlSerializeUri(buf, content); |
804 | 95.6k | } else { |
805 | 95.6k | xmlSerializeText(buf, content, SIZE_MAX, |
806 | 95.6k | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
807 | 95.6k | } |
808 | 99.2k | } else if (child->type == XML_ENTITY_REF_NODE) { |
809 | | /* TODO: We should probably expand entity refs */ |
810 | 1.39k | xmlOutputBufferWrite(buf, 1, "&"); |
811 | 1.39k | xmlOutputBufferWriteString(buf, (char *) child->name); |
812 | 1.39k | xmlOutputBufferWrite(buf, 1, ";"); |
813 | 1.39k | } |
814 | 101k | } |
815 | | |
816 | 98.3k | xmlOutputBufferWrite(buf, 1, "\""); |
817 | 98.3k | } |
818 | 308k | } |
819 | | |
820 | | /** |
821 | | * Serialize an HTML node to an output buffer. |
822 | | * |
823 | | * If `encoding` is specified, it is used to create or update meta |
824 | | * tags containing the character encoding. |
825 | | * |
826 | | * @param buf the HTML buffer output |
827 | | * @param cur the current node |
828 | | * @param encoding the encoding string (optional) |
829 | | * @param format should formatting newlines be added |
830 | | */ |
831 | | void |
832 | | htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur, |
833 | 42.1k | const char *encoding, int format) { |
834 | 42.1k | xmlNodePtr root, parent, metaHead = NULL; |
835 | 42.1k | xmlAttrPtr attr; |
836 | 42.1k | const htmlElemDesc * info; |
837 | 42.1k | int isRaw = 0; |
838 | | |
839 | 42.1k | xmlInitParser(); |
840 | | |
841 | 42.1k | if ((cur == NULL) || (buf == NULL)) { |
842 | 2.11k | return; |
843 | 2.11k | } |
844 | | |
845 | 39.9k | root = cur; |
846 | 39.9k | parent = cur->parent; |
847 | 579k | while (1) { |
848 | 579k | switch (cur->type) { |
849 | 22.3k | case XML_HTML_DOCUMENT_NODE: |
850 | 26.8k | case XML_DOCUMENT_NODE: |
851 | 26.8k | if (((xmlDocPtr) cur)->intSubset != NULL) { |
852 | 19.4k | htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); |
853 | 19.4k | } |
854 | 26.8k | if (cur->children != NULL) { |
855 | | /* Always validate cur->parent when descending. */ |
856 | 24.2k | if (cur->parent == parent) { |
857 | 24.2k | parent = cur; |
858 | 24.2k | cur = cur->children; |
859 | 24.2k | continue; |
860 | 24.2k | } |
861 | 24.2k | } else { |
862 | 2.54k | xmlOutputBufferWrite(buf, 1, "\n"); |
863 | 2.54k | } |
864 | 2.54k | break; |
865 | | |
866 | 333k | case XML_ELEMENT_NODE: { |
867 | 333k | htmlMetaEncoding menc; |
868 | 333k | int isMeta = 0; |
869 | 333k | int addMeta = 0; |
870 | | |
871 | | /* |
872 | | * Some users like lxml are known to pass nodes with a corrupted |
873 | | * tree structure. Fall back to a recursive call to handle this |
874 | | * case. |
875 | | */ |
876 | 333k | if ((cur->parent != parent) && (cur->children != NULL)) { |
877 | 0 | htmlNodeDumpInternal(buf, cur, encoding, format); |
878 | 0 | break; |
879 | 0 | } |
880 | | |
881 | | /* |
882 | | * Get specific HTML info for that node. |
883 | | */ |
884 | 333k | if (cur->ns == NULL) |
885 | 293k | info = htmlTagLookup(cur->name); |
886 | 40.1k | else |
887 | 40.1k | info = NULL; |
888 | | |
889 | 333k | if (encoding != NULL) { |
890 | 0 | isMeta = htmlParseMetaEncoding(cur, &menc); |
891 | | |
892 | | /* |
893 | | * Don't add meta tag for "HTML" encoding. |
894 | | */ |
895 | 0 | if ((xmlStrcasecmp(BAD_CAST encoding, |
896 | 0 | BAD_CAST "HTML") != 0) && |
897 | 0 | (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) && |
898 | 0 | (parent != NULL) && |
899 | 0 | (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) && |
900 | 0 | (parent->parent != NULL) && |
901 | 0 | (parent->parent->parent == NULL) && |
902 | 0 | (metaHead == NULL)) { |
903 | 0 | xmlNodePtr n; |
904 | |
|
905 | 0 | metaHead = cur; |
906 | 0 | addMeta = 1; |
907 | |
|
908 | 0 | for (n = cur->children; n != NULL; n = n->next) { |
909 | 0 | int unused; |
910 | |
|
911 | 0 | if (htmlFindMetaEncodingAttr(n, &unused) != NULL) { |
912 | 0 | metaHead = NULL; |
913 | 0 | addMeta = 0; |
914 | 0 | break; |
915 | 0 | } |
916 | 0 | } |
917 | 0 | } |
918 | 0 | } |
919 | | |
920 | 333k | xmlOutputBufferWrite(buf, 1, "<"); |
921 | 333k | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
922 | 36.6k | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
923 | 36.6k | xmlOutputBufferWrite(buf, 1, ":"); |
924 | 36.6k | } |
925 | 333k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
926 | 333k | if (cur->nsDef) |
927 | 23.4k | xmlNsListDumpOutput(buf, cur->nsDef); |
928 | 333k | attr = cur->properties; |
929 | 640k | while (attr != NULL) { |
930 | 306k | if ((!isMeta) || (attr != menc.attr)) { |
931 | 306k | htmlAttrDumpOutput(buf, attr); |
932 | 306k | } else { |
933 | 0 | xmlOutputBufferWrite(buf, 1, " "); |
934 | 0 | xmlOutputBufferWriteString(buf, (char *) attr->name); |
935 | |
|
936 | 0 | xmlOutputBufferWrite(buf, 2, "=\""); |
937 | 0 | xmlSerializeText(buf, menc.attrValue, menc.off.start, |
938 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
939 | 0 | xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
940 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
941 | 0 | xmlSerializeText(buf, menc.attrValue + menc.off.end, |
942 | 0 | menc.off.size - menc.off.end, |
943 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
944 | 0 | xmlOutputBufferWrite(buf, 1, "\""); |
945 | 0 | } |
946 | 306k | attr = attr->next; |
947 | 306k | } |
948 | | |
949 | 333k | if ((info != NULL) && (info->empty)) { |
950 | 9.45k | xmlOutputBufferWrite(buf, 1, ">"); |
951 | 323k | } else if (cur->children == NULL) { |
952 | 35.0k | if (addMeta) { |
953 | 0 | xmlOutputBufferWrite(buf, 16, "><meta charset=\""); |
954 | 0 | xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
955 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
956 | 0 | xmlOutputBufferWrite(buf, 4, "\"></"); |
957 | 35.0k | } else { |
958 | 35.0k | xmlOutputBufferWrite(buf, 3, "></"); |
959 | 35.0k | } |
960 | 35.0k | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
961 | 855 | xmlOutputBufferWriteString(buf, |
962 | 855 | (const char *)cur->ns->prefix); |
963 | 855 | xmlOutputBufferWrite(buf, 1, ":"); |
964 | 855 | } |
965 | 35.0k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
966 | 35.0k | xmlOutputBufferWrite(buf, 1, ">"); |
967 | 288k | } else { |
968 | 288k | xmlOutputBufferWrite(buf, 1, ">"); |
969 | 288k | if ((format) && |
970 | 288k | ((addMeta) || |
971 | 281k | ((info != NULL) && (!info->isinline) && |
972 | 281k | (cur->children->type != HTML_TEXT_NODE) && |
973 | 281k | (cur->children->type != HTML_ENTITY_REF_NODE) && |
974 | 281k | (cur->children != cur->last) && |
975 | 281k | (cur->name != NULL) && |
976 | 281k | (cur->name[0] != 'p')))) /* p, pre, param */ |
977 | 2.20k | xmlOutputBufferWrite(buf, 1, "\n"); |
978 | 288k | if (addMeta) { |
979 | 0 | xmlOutputBufferWrite(buf, 15, "<meta charset=\""); |
980 | 0 | xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
981 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
982 | 0 | xmlOutputBufferWrite(buf, 2, "\">"); |
983 | 0 | if ((format) && |
984 | 0 | (cur->children->type != HTML_TEXT_NODE) && |
985 | 0 | (cur->children->type != HTML_ENTITY_REF_NODE)) |
986 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
987 | 0 | } |
988 | | |
989 | 288k | if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT)) |
990 | 2.50k | isRaw = 1; |
991 | | |
992 | 288k | parent = cur; |
993 | 288k | cur = cur->children; |
994 | 288k | continue; |
995 | 288k | } |
996 | | |
997 | 44.5k | if ((format) && (cur->next != NULL) && |
998 | 44.5k | (info != NULL) && (!info->isinline)) { |
999 | 14.0k | if ((cur->next->type != HTML_TEXT_NODE) && |
1000 | 14.0k | (cur->next->type != HTML_ENTITY_REF_NODE) && |
1001 | 14.0k | (parent != NULL) && |
1002 | 14.0k | (parent->name != NULL) && |
1003 | 14.0k | (parent->name[0] != 'p')) /* p, pre, param */ |
1004 | 5.21k | xmlOutputBufferWrite(buf, 1, "\n"); |
1005 | 14.0k | } |
1006 | | |
1007 | 44.5k | break; |
1008 | 333k | } |
1009 | | |
1010 | 1.80k | case XML_ATTRIBUTE_NODE: |
1011 | 1.80k | htmlAttrDumpOutput(buf, (xmlAttrPtr) cur); |
1012 | 1.80k | break; |
1013 | | |
1014 | 138k | case HTML_TEXT_NODE: |
1015 | 138k | if (cur->content == NULL) |
1016 | 633 | break; |
1017 | 137k | if ((cur->name == (const xmlChar *)xmlStringTextNoenc) || |
1018 | 137k | (isRaw)) { |
1019 | 417 | xmlOutputBufferWriteString(buf, (const char *)cur->content); |
1020 | 137k | } else { |
1021 | 137k | xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML); |
1022 | 137k | } |
1023 | 137k | break; |
1024 | | |
1025 | 48.5k | case HTML_COMMENT_NODE: |
1026 | 48.5k | if (cur->content != NULL) { |
1027 | 48.3k | xmlOutputBufferWrite(buf, 4, "<!--"); |
1028 | 48.3k | xmlOutputBufferWriteString(buf, (const char *)cur->content); |
1029 | 48.3k | xmlOutputBufferWrite(buf, 3, "-->"); |
1030 | 48.3k | } |
1031 | 48.5k | break; |
1032 | | |
1033 | 1.55k | case HTML_PI_NODE: |
1034 | 1.55k | if (cur->name != NULL) { |
1035 | 1.47k | xmlOutputBufferWrite(buf, 2, "<?"); |
1036 | 1.47k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
1037 | 1.47k | if (cur->content != NULL) { |
1038 | 1.03k | xmlOutputBufferWrite(buf, 1, " "); |
1039 | 1.03k | xmlOutputBufferWriteString(buf, |
1040 | 1.03k | (const char *)cur->content); |
1041 | 1.03k | } |
1042 | 1.47k | xmlOutputBufferWrite(buf, 1, ">"); |
1043 | 1.47k | } |
1044 | 1.55k | break; |
1045 | | |
1046 | 3.95k | case HTML_ENTITY_REF_NODE: |
1047 | 3.95k | xmlOutputBufferWrite(buf, 1, "&"); |
1048 | 3.95k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
1049 | 3.95k | xmlOutputBufferWrite(buf, 1, ";"); |
1050 | 3.95k | break; |
1051 | | |
1052 | 2.53k | case HTML_PRESERVE_NODE: |
1053 | 2.53k | if (cur->content != NULL) { |
1054 | 2.32k | xmlOutputBufferWriteString(buf, (const char *)cur->content); |
1055 | 2.32k | } |
1056 | 2.53k | break; |
1057 | | |
1058 | 22.0k | default: |
1059 | 22.0k | break; |
1060 | 579k | } |
1061 | | |
1062 | 579k | while (1) { |
1063 | 579k | if (cur == root) |
1064 | 39.9k | return; |
1065 | 539k | if (cur->next != NULL) { |
1066 | 225k | cur = cur->next; |
1067 | 225k | break; |
1068 | 225k | } |
1069 | | |
1070 | 313k | isRaw = 0; |
1071 | | |
1072 | 313k | cur = parent; |
1073 | | /* cur->parent was validated when descending. */ |
1074 | 313k | parent = cur->parent; |
1075 | | |
1076 | 313k | if ((cur->type == XML_HTML_DOCUMENT_NODE) || |
1077 | 313k | (cur->type == XML_DOCUMENT_NODE)) { |
1078 | 24.2k | xmlOutputBufferWrite(buf, 1, "\n"); |
1079 | 288k | } else { |
1080 | 288k | if ((format) && (cur->ns == NULL)) |
1081 | 245k | info = htmlTagLookup(cur->name); |
1082 | 43.8k | else |
1083 | 43.8k | info = NULL; |
1084 | | |
1085 | 288k | if ((format) && (info != NULL) && (!info->isinline) && |
1086 | 288k | (cur->last->type != HTML_TEXT_NODE) && |
1087 | 288k | (cur->last->type != HTML_ENTITY_REF_NODE) && |
1088 | 288k | ((cur->children != cur->last) || (cur == metaHead)) && |
1089 | 288k | (cur->name != NULL) && |
1090 | 288k | (cur->name[0] != 'p')) /* p, pre, param */ |
1091 | 5.29k | xmlOutputBufferWrite(buf, 1, "\n"); |
1092 | | |
1093 | 288k | xmlOutputBufferWrite(buf, 2, "</"); |
1094 | 288k | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
1095 | 35.8k | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
1096 | 35.8k | xmlOutputBufferWrite(buf, 1, ":"); |
1097 | 35.8k | } |
1098 | 288k | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
1099 | 288k | xmlOutputBufferWrite(buf, 1, ">"); |
1100 | | |
1101 | 288k | if ((format) && (info != NULL) && (!info->isinline) && |
1102 | 288k | (cur->next != NULL)) { |
1103 | 7.84k | if ((cur->next->type != HTML_TEXT_NODE) && |
1104 | 7.84k | (cur->next->type != HTML_ENTITY_REF_NODE) && |
1105 | 7.84k | (parent != NULL) && |
1106 | 7.84k | (parent->name != NULL) && |
1107 | 7.84k | (parent->name[0] != 'p')) /* p, pre, param */ |
1108 | 2.25k | xmlOutputBufferWrite(buf, 1, "\n"); |
1109 | 7.84k | } |
1110 | | |
1111 | 288k | if (cur == metaHead) |
1112 | 0 | metaHead = NULL; |
1113 | 288k | } |
1114 | 313k | } |
1115 | 265k | } |
1116 | 39.9k | } |
1117 | | |
1118 | | /**
1119 | | * Serialize an HTML node to an output buffer.
1120 | | * Forwards to htmlNodeDumpInternal, always passing NULL as the encoding.
1121 | | * @param buf the output buffer the HTML is written to
1122 | | * @param doc the document (unused)
1123 | | * @param cur the node to serialize
1124 | | * @param encoding the encoding string (unused; NULL is passed on instead)
1125 | | * @param format non-zero if formatting newlines should be added
1126 | | */
1127 | | void
1128 | | htmlNodeDumpFormatOutput(xmlOutputBuffer *buf,
1129 | | xmlDoc *doc ATTRIBUTE_UNUSED, xmlNode *cur,
1130 | 3.20k | const char *encoding ATTRIBUTE_UNUSED, int format) {
1131 | 3.20k | htmlNodeDumpInternal(buf, cur, NULL, format);
1132 | 3.20k | }
1133 | | |
1134 | | /**
1135 | | * Serialize an HTML node to an output buffer. Same as
1136 | | * #htmlNodeDumpFormatOutput with `format` set to 1, which is typically
1137 | | * undesired. Use of this function is DISCOURAGED in favor of #htmlNodeDumpFormatOutput.
1138 | | *
1139 | | * @param buf the output buffer the HTML is written to
1140 | | * @param doc the document (unused)
1141 | | * @param cur the node to serialize
1142 | | * @param encoding the encoding string (unused; NULL is passed on instead)
1143 | | */
1144 | | void
1145 | | htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED,
1146 | 5.73k | xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED) {
1147 | 5.73k | htmlNodeDumpInternal(buf, cur, NULL, 1);
1148 | 5.73k | }
1149 | | |
1150 | | /**
1151 | | * Serialize an HTML document to an output buffer.
1152 | | * Forwards to htmlNodeDumpInternal with the document cast to a node and a NULL encoding.
1153 | | * @param buf the output buffer the HTML is written to
1154 | | * @param cur the document
1155 | | * @param encoding the encoding string (unused; NULL is passed on instead)
1156 | | * @param format non-zero if formatting newlines should be added
1157 | | */
1158 | | void
1159 | | htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur,
1160 | | const char *encoding ATTRIBUTE_UNUSED,
1161 | 11.0k | int format) {
1162 | 11.0k | htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format);
1163 | 11.0k | }
1164 | | |
1165 | | /**
1166 | | * Serialize an HTML document to an output buffer. Same as
1167 | | * #htmlDocContentDumpFormatOutput with `format` set to 1, which is typically
1168 | | * undesired. Use of this function is DISCOURAGED in favor of #htmlDocContentDumpFormatOutput.
1169 | | *
1170 | | * @param buf the output buffer the HTML is written to
1171 | | * @param cur the document
1172 | | * @param encoding the encoding string (unused; NULL is passed on instead)
1173 | | */
1174 | | void
1175 | | htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur,
1176 | 14.6k | const char *encoding ATTRIBUTE_UNUSED) {
1177 | 14.6k | htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1);
1178 | 14.6k | }
1179 | | |
1180 | | /************************************************************************ |
1181 | | * * |
1182 | | * Saving functions front-ends * |
1183 | | * * |
1184 | | ************************************************************************/ |
1185 | | |
1186 | | /**
1187 | | * Serialize an HTML document to an open `FILE`.
1188 | | *
1189 | | * Uses the encoding of the document. If the document has no
1190 | | * encoding, ASCII with HTML 4.0 named character entities will
1191 | | * be used. This is inefficient compared to UTF-8 and might be
1192 | | * changed in a future version.
1193 | | *
1194 | | * Enables "formatting" unconditionally (the document is written through
1195 | | * #htmlDocContentDumpOutput), which is typically undesired.
1196 | | *
1197 | | * Use of this function is DISCOURAGED in favor of
1198 | | * #htmlNodeDumpFileFormat.
1199 | | *
1200 | | * @param f the open FILE* to write to; NULL is rejected with -1
1201 | | * @param cur the document; NULL is rejected with -1
1202 | | * @returns the number of bytes written or -1 in case of failure (NULL argument, encoder lookup failure, or output buffer creation failure).
1203 | | */
1204 | | int
1205 | 0 | htmlDocDump(FILE *f, xmlDoc *cur) {
1206 | 0 | xmlOutputBufferPtr buf;
1207 | 0 | xmlCharEncodingHandlerPtr handler = NULL;
1208 | 0 | int ret;
1209 |

1210 | 0 | xmlInitParser();
1211 |

1212 | 0 | if ((cur == NULL) || (f == NULL)) {
1213 | 0 | return(-1);
1214 | 0 | }
1215 | | /* Look up the output encoder from the document's own encoding. */
1216 | 0 | if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
1217 | 0 | return(-1);
1218 | 0 | buf = xmlOutputBufferCreateFile(f, handler);
1219 | 0 | if (buf == NULL) {
1220 | 0 | xmlCharEncCloseFunc(handler); /* no buffer was created, so release the encoder here */
1221 | 0 | return(-1);
1222 | 0 | }
1223 | 0 | htmlDocContentDumpOutput(buf, cur, NULL);
1224 |

1225 | 0 | ret = xmlOutputBufferClose(buf); /* the close result is the function's return value */
1226 | 0 | return(ret);
1227 | 0 | }
1228 | | |
1229 | | /**
1230 | | * Serialize an HTML document to a file.
1231 | | *
1232 | | * Same as #htmlSaveFileFormat with `encoding` set to NULL and
1233 | | * `format` set to 1, which is typically undesired.
1234 | | *
1235 | | * Use of this function is DISCOURAGED in favor of
1236 | | * #htmlSaveFileFormat.
1237 | | *
1238 | | * @param filename the filename (or URL)
1239 | | * @param cur the document
1240 | | * @returns the result of #htmlSaveFileFormat (documented there as the number of bytes written or -1 in case of failure).
1241 | | */
1242 | | int
1243 | 0 | htmlSaveFile(const char *filename, xmlDoc *cur) {
1244 | 0 | return(htmlSaveFileFormat(filename, cur, NULL, 1));
1245 | 0 | }
1246 | | |
1247 | | /**
1248 | | * Serialize an HTML document to a file using a given encoding.
1249 | | *
1250 | | * If `filename` is `"-"`, stdout is used. This is potentially
1251 | | * insecure and might be changed in a future version.
1252 | | *
1253 | | * If encoding is NULL, ASCII with HTML 4.0 named character entities
1254 | | * will be used. This is inefficient compared to UTF-8 and might be
1255 | | * changed in a future version.
1256 | | *
1257 | | * NOTE(review): documented as setting or updating meta tags containing the character encoding, but the
1258 | | * document is written through #htmlDocContentDumpFormatOutput, which ignores its encoding argument - confirm.
1259 | | * @param filename the filename
1260 | | * @param cur the document
1261 | | * @param encoding the document encoding (optional)
1262 | | * @param format non-zero if formatting newlines should be added
1263 | | * @returns the number of bytes written or -1 in case of failure (NULL argument, encoder lookup failure, or the output could not be opened).
1264 | | */
1265 | | int
1266 | | htmlSaveFileFormat(const char *filename, xmlDoc *cur,
1267 | 0 | const char *encoding, int format) {
1268 | 0 | xmlOutputBufferPtr buf;
1269 | 0 | xmlCharEncodingHandlerPtr handler = NULL;
1270 | 0 | int ret;
1271 |

1272 | 0 | if ((cur == NULL) || (filename == NULL))
1273 | 0 | return(-1);
1274 | |
1275 | 0 | xmlInitParser();
1276 |

1277 | 0 | if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
1278 | 0 | return(-1);
1279 | |
1280 | | /*
1281 | | * Open the output for `filename`. If that fails, release the encoder and report -1 like every other failure path (0 would look like an empty but successful write).
1282 | | */
1283 | 0 | buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1284 | 0 | if (buf == NULL) {
1285 | 0 | xmlCharEncCloseFunc(handler);
1286 | 0 | return(-1);
1287 | 0 | }
1288 | |
1289 | 0 | htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1290 |

1291 | 0 | ret = xmlOutputBufferClose(buf);
1292 | 0 | return(ret);
1293 | 0 | }
1294 | | |
1295 | | /**
1296 | | * Serialize an HTML document to a file.
1297 | | *
1298 | | * Same as #htmlSaveFileFormat with `format` set to 1, which is
1299 | | * typically undesired. Also see the warnings there. Use of this
1300 | | * function is DISCOURAGED in favor of #htmlSaveFileFormat.
1301 | | *
1302 | | * @param filename the filename
1303 | | * @param cur the document
1304 | | * @param encoding the document encoding, passed through unchanged to #htmlSaveFileFormat
1305 | | * @returns the result of #htmlSaveFileFormat (documented there as the number of bytes written or -1 in case of failure).
1306 | | */
1307 | | int
1308 | 0 | htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) {
1309 | 0 | return(htmlSaveFileFormat(filename, cur, encoding, 1));
1310 | 0 | }
1311 | | |
1312 | | #endif /* LIBXML_OUTPUT_ENABLED */ |
1313 | | |
1314 | | #endif /* LIBXML_HTML_ENABLED */ |