Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * HTMLtree.c : implementation of access function for an HTML tree. |
3 | | * |
4 | | * See Copyright for the status of this software. |
5 | | * |
6 | | * Author: Daniel Veillard |
7 | | */ |
8 | | |
9 | | |
10 | | #define IN_LIBXML |
11 | | #include "libxml.h" |
12 | | #ifdef LIBXML_HTML_ENABLED |
13 | | |
14 | | #include <string.h> /* for memset() only ! */ |
15 | | #include <ctype.h> |
16 | | #include <stdlib.h> |
17 | | |
18 | | #include <libxml/xmlmemory.h> |
19 | | #include <libxml/HTMLparser.h> |
20 | | #include <libxml/HTMLtree.h> |
21 | | #include <libxml/entities.h> |
22 | | #include <libxml/xmlerror.h> |
23 | | #include <libxml/parserInternals.h> |
24 | | #include <libxml/uri.h> |
25 | | |
26 | | #include "private/buf.h" |
27 | | #include "private/html.h" |
28 | | #include "private/error.h" |
29 | | #include "private/html.h" |
30 | | #include "private/io.h" |
31 | | #include "private/save.h" |
32 | | #include "private/tree.h" |
33 | | |
34 | | /************************************************************************ |
35 | | * * |
36 | | * Getting/Setting encoding meta tags * |
37 | | * * |
38 | | ************************************************************************/ |
39 | | |
40 | | typedef struct { |
41 | | xmlAttrPtr attr; /* charset or content */ |
42 | | const xmlChar *attrValue; |
43 | | htmlMetaEncodingOffsets off; |
44 | | } htmlMetaEncoding; |
45 | | |
46 | | static htmlNodePtr |
47 | 0 | htmlFindFirstChild(htmlNodePtr parent, const char *name) { |
48 | 0 | htmlNodePtr child; |
49 | |
|
50 | 0 | for (child = parent->children; child != NULL; child = child->next) { |
51 | 0 | if ((child->type == XML_ELEMENT_NODE) && |
52 | 0 | (xmlStrcasecmp(child->name, BAD_CAST name) == 0)) |
53 | 0 | return(child); |
54 | 0 | } |
55 | | |
56 | 0 | return(NULL); |
57 | 0 | } |
58 | | |
59 | | static htmlNodePtr |
60 | 0 | htmlFindHead(htmlDocPtr doc) { |
61 | 0 | htmlNodePtr html; |
62 | |
|
63 | 0 | if (doc == NULL) |
64 | 0 | return(NULL); |
65 | | |
66 | 0 | html = htmlFindFirstChild((htmlNodePtr) doc, "html"); |
67 | 0 | if (html == NULL) |
68 | 0 | return(NULL); |
69 | | |
70 | 0 | return(htmlFindFirstChild(html, "head")); |
71 | 0 | } |
72 | | |
73 | | int |
74 | 0 | htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) { |
75 | 0 | const xmlChar *p = val; |
76 | |
|
77 | 0 | while (1) { |
78 | 0 | size_t start, end; |
79 | |
|
80 | 0 | while ((*p != 'c') && (*p != 'C')) { |
81 | 0 | if (*p == 0) |
82 | 0 | return(0); |
83 | 0 | p += 1; |
84 | 0 | } |
85 | 0 | p += 1; |
86 | |
|
87 | 0 | if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0) |
88 | 0 | continue; |
89 | | |
90 | 0 | p += 6; |
91 | 0 | while (IS_WS_HTML(*p)) p += 1; |
92 | |
|
93 | 0 | if (*p != '=') |
94 | 0 | continue; |
95 | | |
96 | 0 | p += 1; |
97 | 0 | while (IS_WS_HTML(*p)) p += 1; |
98 | |
|
99 | 0 | if (*p == 0) |
100 | 0 | return(0); |
101 | | |
102 | 0 | if ((*p == '"') || (*p == '\'')) { |
103 | 0 | int quote = *p; |
104 | |
|
105 | 0 | p += 1; |
106 | 0 | while (IS_WS_HTML(*p)) p += 1; |
107 | |
|
108 | 0 | start = p - val; |
109 | 0 | end = start; |
110 | |
|
111 | 0 | while (*p != quote) { |
112 | 0 | if (*p == 0) |
113 | 0 | return(0); |
114 | 0 | if (!IS_WS_HTML(*p)) |
115 | 0 | end = p + 1 - val; |
116 | 0 | p += 1; |
117 | 0 | } |
118 | 0 | } else { |
119 | 0 | start = p - val; |
120 | |
|
121 | 0 | while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p))) |
122 | 0 | p += 1; |
123 | |
|
124 | 0 | end = p - val; |
125 | 0 | } |
126 | | |
127 | 0 | off->start = start; |
128 | 0 | off->end = end; |
129 | 0 | off->size = p - val + strlen((char *) p); |
130 | |
|
131 | 0 | return(1); |
132 | 0 | } |
133 | | |
134 | 0 | return(0); |
135 | 0 | } |
136 | | |
137 | | static xmlAttrPtr |
138 | 0 | htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) { |
139 | 0 | xmlAttrPtr attr, contentAttr = NULL; |
140 | 0 | int isContentType = 0; |
141 | |
|
142 | 0 | if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0) |
143 | 0 | return(NULL); |
144 | | |
145 | 0 | for (attr = elem->properties; attr != NULL; attr = attr->next) { |
146 | 0 | if (attr->ns != NULL) |
147 | 0 | continue; |
148 | 0 | if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) { |
149 | 0 | *outIsContentType = 0; |
150 | 0 | return(attr); |
151 | 0 | } |
152 | 0 | if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0) |
153 | 0 | contentAttr = attr; |
154 | 0 | if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) && |
155 | 0 | (attr->children != NULL) && |
156 | 0 | (attr->children->type == XML_TEXT_NODE) && |
157 | 0 | (attr->children->next == NULL) && |
158 | 0 | (xmlStrcasecmp(attr->children->content, |
159 | 0 | BAD_CAST "Content-Type") == 0)) |
160 | 0 | isContentType = 1; |
161 | 0 | } |
162 | | |
163 | 0 | if ((isContentType) && (contentAttr != NULL)) { |
164 | 0 | *outIsContentType = 1; |
165 | 0 | return(contentAttr); |
166 | 0 | } |
167 | | |
168 | 0 | return(NULL); |
169 | 0 | } |
170 | | |
171 | | static int |
172 | 0 | htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) { |
173 | 0 | xmlAttrPtr attr; |
174 | 0 | const xmlChar *val = NULL; |
175 | 0 | int isContentType; |
176 | |
|
177 | 0 | if ((elem->type != XML_ELEMENT_NODE) || |
178 | 0 | (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)) |
179 | 0 | return(0); |
180 | | |
181 | 0 | attr = htmlFindMetaEncodingAttr(elem, &isContentType); |
182 | 0 | if (attr == NULL) |
183 | 0 | return(0); |
184 | | |
185 | 0 | if ((attr->children != NULL) && |
186 | 0 | (attr->children->type == XML_TEXT_NODE) && |
187 | 0 | (attr->children->next == NULL) && |
188 | 0 | (attr->children->content != NULL)) |
189 | 0 | val = attr->children->content; |
190 | 0 | else |
191 | 0 | val = BAD_CAST ""; |
192 | | |
193 | |
|
194 | 0 | if (!isContentType) { |
195 | 0 | size_t size = strlen((char *) val); |
196 | 0 | size_t start = 0; |
197 | 0 | size_t end = size; |
198 | |
|
199 | 0 | while ((start < size) && (IS_WS_HTML(val[start]))) |
200 | 0 | start += 1; |
201 | |
|
202 | 0 | while ((end > 0) && (IS_WS_HTML(val[end-1]))) |
203 | 0 | end -= 1; |
204 | |
|
205 | 0 | menc->attr = attr; |
206 | 0 | menc->attrValue = val; |
207 | 0 | menc->off.start = start; |
208 | 0 | menc->off.end = end; |
209 | 0 | menc->off.size = size; |
210 | |
|
211 | 0 | return(1); |
212 | 0 | } else { |
213 | 0 | if (htmlParseContentType(val, &menc->off)) { |
214 | 0 | menc->attr = attr; |
215 | 0 | menc->attrValue = val; |
216 | |
|
217 | 0 | return(1); |
218 | 0 | } |
219 | 0 | } |
220 | | |
221 | 0 | return(0); |
222 | 0 | } |
223 | | |
224 | | static xmlChar * |
225 | 0 | htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) { |
226 | 0 | xmlChar *newVal, *p; |
227 | 0 | size_t size, oldEncSize, newEncSize; |
228 | | |
229 | | /* |
230 | | * The pseudo "HTML" encoding only produces ASCII. |
231 | | */ |
232 | 0 | if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0) |
233 | 0 | encoding = "ASCII"; |
234 | |
|
235 | 0 | oldEncSize = menc->off.end - menc->off.start; |
236 | 0 | newEncSize = strlen((char *) encoding); |
237 | 0 | size = menc->off.size - oldEncSize + newEncSize; |
238 | 0 | newVal = xmlMalloc(size + 1); |
239 | 0 | if (newVal == NULL) |
240 | 0 | return(NULL); |
241 | | |
242 | 0 | p = newVal; |
243 | 0 | memcpy(p, menc->attrValue, menc->off.start); |
244 | 0 | p += menc->off.start; |
245 | 0 | memcpy(p, encoding, newEncSize); |
246 | 0 | p += newEncSize; |
247 | 0 | memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end); |
248 | 0 | newVal[size] = 0; |
249 | |
|
250 | 0 | return(newVal); |
251 | 0 | } |
252 | | |
253 | | /** |
254 | | * Look up and encoding declaration in the meta tags. |
255 | | * |
256 | | * The returned string points into attribute content and can contain |
257 | | * trailing garbage. It should be copied before modifying or freeing |
258 | | * nodes. |
259 | | * |
260 | | * @param doc the document |
261 | | * @returns the encoding ot NULL if not found. |
262 | | */ |
263 | | const xmlChar * |
264 | 0 | htmlGetMetaEncoding(xmlDoc *doc) { |
265 | 0 | htmlNodePtr head, node; |
266 | |
|
267 | 0 | head = htmlFindHead(doc); |
268 | 0 | if (head == NULL) |
269 | 0 | return(NULL); |
270 | | |
271 | 0 | for (node = head->children; node != NULL; node = node->next) { |
272 | 0 | htmlMetaEncoding menc; |
273 | |
|
274 | 0 | if (htmlParseMetaEncoding(node, &menc)) { |
275 | | /* |
276 | | * Returning a `const xmlChar *` only allows to return |
277 | | * a suffix. In http-equiv meta tags, there could be |
278 | | * more data after the charset, although it's probably |
279 | | * rare in practice. |
280 | | */ |
281 | 0 | return(menc.attrValue + menc.off.start); |
282 | 0 | } |
283 | 0 | } |
284 | | |
285 | 0 | return(NULL); |
286 | 0 | } |
287 | | |
288 | | /** |
289 | | * Creates or updates a meta tag with an encoding declaration. |
290 | | * |
291 | | * NOTE: This will not change the document content encoding. |
292 | | * |
293 | | * @param doc the document |
294 | | * @param encoding the encoding string |
295 | | * @returns 0 in case of success, 1 if no head element was found or |
296 | | * arguments are invalid and -1 if memory allocation failed. |
297 | | */ |
298 | | int |
299 | 0 | htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) { |
300 | 0 | htmlNodePtr head, meta; |
301 | 0 | int found = 0; |
302 | |
|
303 | 0 | if (encoding == NULL) |
304 | 0 | return(1); |
305 | | |
306 | 0 | head = htmlFindHead(doc); |
307 | 0 | if (head == NULL) |
308 | 0 | return(1); |
309 | | |
310 | 0 | for (meta = head->children; meta != NULL; meta = meta->next) { |
311 | 0 | htmlMetaEncoding menc; |
312 | |
|
313 | 0 | if (htmlParseMetaEncoding(meta, &menc)) { |
314 | 0 | xmlChar *newVal; |
315 | 0 | int ret; |
316 | |
|
317 | 0 | found = 1; |
318 | |
|
319 | 0 | newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding); |
320 | 0 | if (newVal == NULL) |
321 | 0 | return(-1); |
322 | 0 | xmlNodeSetContent((xmlNodePtr) menc.attr, NULL); |
323 | 0 | ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal); |
324 | 0 | xmlFree(newVal); |
325 | |
|
326 | 0 | if (ret < 0) |
327 | 0 | return(-1); |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | 0 | if (found) |
332 | 0 | return(0); |
333 | | |
334 | 0 | meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL); |
335 | 0 | if (meta == NULL) |
336 | 0 | return(-1); |
337 | | |
338 | 0 | if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) { |
339 | 0 | xmlFreeNode(meta); |
340 | 0 | return(-1); |
341 | 0 | } |
342 | | |
343 | 0 | if (head->children == NULL) |
344 | 0 | xmlAddChild(head, meta); |
345 | 0 | else |
346 | 0 | xmlAddPrevSibling(head->children, meta); |
347 | |
|
348 | 0 | return(0); |
349 | 0 | } |
350 | | |
351 | | /** |
352 | | * Determine if a given attribute is a boolean attribute. This |
353 | | * doesn't handle HTML5. |
354 | | * |
355 | | * @deprecated Internal function, don't use. |
356 | | * |
357 | | * @param name the name of the attribute to check |
358 | | * @returns false if the attribute is not boolean, true otherwise. |
359 | | */ |
360 | | int |
361 | | htmlIsBooleanAttr(const xmlChar *name) |
362 | 0 | { |
363 | 0 | const char *str = NULL; |
364 | |
|
365 | 0 | if (name == NULL) |
366 | 0 | return(0); |
367 | | |
368 | | /* |
369 | | * These are the HTML attributes which will be output |
370 | | * in minimized form, i.e. `<option selected="selected">` will be |
371 | | * output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output |
372 | | * Method": |
373 | | * |
374 | | * "checked", "compact", "declare", "defer", "disabled", "ismap", |
375 | | * "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", |
376 | | * "selected" |
377 | | * |
378 | | * Additional attributes from HTML5 (not implemented yet): |
379 | | * |
380 | | * "allowfullscreen", "alpha", "async", "autofocus", "autoplay", |
381 | | * "controls", "default", "formnovalidate", "inert", "itemscope", |
382 | | * "loop", "muted", "nomodule", "novalidate", "open", "playsinline", |
383 | | * "required", "reversed", "shadowrootdelegatesfocus", |
384 | | * "shadowrootclonable", "shadowrootserializable", |
385 | | * "shadowrootcustomelementregistry", "truespeed" |
386 | | */ |
387 | | |
388 | 0 | switch (name[0] | 0x20) { |
389 | 0 | case 'c': |
390 | 0 | name += 1; |
391 | 0 | switch (name[0] | 0x20) { |
392 | 0 | case 'h': str = "ecked"; break; |
393 | 0 | case 'o': str = "mpact"; break; |
394 | 0 | } |
395 | 0 | break; |
396 | 0 | case 'd': |
397 | 0 | name += 1; |
398 | 0 | switch (name[0] | 0x20) { |
399 | 0 | case 'e': |
400 | 0 | name += 1; |
401 | 0 | switch (name[0] | 0x20) { |
402 | 0 | case 'c': str = "lare"; break; |
403 | 0 | case 'f': str = "er"; break; |
404 | 0 | } |
405 | 0 | break; |
406 | 0 | case 'i': str = "sabled"; break; |
407 | 0 | } |
408 | 0 | break; |
409 | 0 | case 'i': |
410 | 0 | str = "smap"; |
411 | 0 | break; |
412 | 0 | case 'm': |
413 | 0 | str = "ultiple"; |
414 | 0 | break; |
415 | 0 | case 'n': |
416 | 0 | name += 1; |
417 | 0 | if ((name[0] | 0x20) != 'o') |
418 | 0 | break; |
419 | 0 | name += 1; |
420 | 0 | switch (name[0] | 0x20) { |
421 | 0 | case 'h': str = "ref"; break; |
422 | 0 | case 'r': str = "esize"; break; |
423 | 0 | case 's': str = "hade"; break; |
424 | 0 | case 'w': str = "rap"; break; |
425 | 0 | } |
426 | 0 | break; |
427 | 0 | case 'r': |
428 | 0 | str = "eadonly"; |
429 | 0 | break; |
430 | 0 | case 's': |
431 | 0 | str = "elected"; |
432 | 0 | break; |
433 | 0 | } |
434 | | |
435 | 0 | if (str == NULL) |
436 | 0 | return(0); |
437 | | |
438 | 0 | return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0); |
439 | 0 | } |
440 | | |
441 | | #ifdef LIBXML_OUTPUT_ENABLED |
442 | | /************************************************************************ |
443 | | * * |
444 | | * Dumping HTML tree content to a simple buffer * |
445 | | * * |
446 | | ************************************************************************/ |
447 | | |
448 | | static xmlParserErrors |
449 | 0 | htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) { |
450 | | /* |
451 | | * Fallback to HTML if the encoding is unspecified |
452 | | */ |
453 | 0 | if (encoding == NULL) |
454 | 0 | encoding = "HTML"; |
455 | |
|
456 | 0 | return(xmlOpenCharEncodingHandler(encoding, /* output */ 1, out)); |
457 | 0 | } |
458 | | |
459 | | /** |
460 | | * Serialize an HTML document to an xmlBuf. |
461 | | * |
462 | | * @param buf the xmlBuf output |
463 | | * @param doc the document (unused) |
464 | | * @param cur the current node |
465 | | * @param format should formatting newlines been added |
466 | | * @returns the number of bytes written or -1 in case of error |
467 | | */ |
468 | | static size_t |
469 | | htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED, |
470 | 0 | xmlNodePtr cur, int format) { |
471 | 0 | size_t use; |
472 | 0 | size_t ret; |
473 | 0 | xmlOutputBufferPtr outbuf; |
474 | |
|
475 | 0 | if (cur == NULL) { |
476 | 0 | return ((size_t) -1); |
477 | 0 | } |
478 | 0 | if (buf == NULL) { |
479 | 0 | return ((size_t) -1); |
480 | 0 | } |
481 | 0 | outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); |
482 | 0 | if (outbuf == NULL) |
483 | 0 | return ((size_t) -1); |
484 | 0 | memset(outbuf, 0, sizeof(xmlOutputBuffer)); |
485 | 0 | outbuf->buffer = buf; |
486 | 0 | outbuf->encoder = NULL; |
487 | 0 | outbuf->writecallback = NULL; |
488 | 0 | outbuf->closecallback = NULL; |
489 | 0 | outbuf->context = NULL; |
490 | 0 | outbuf->written = 0; |
491 | |
|
492 | 0 | use = xmlBufUse(buf); |
493 | 0 | htmlNodeDumpInternal(outbuf, cur, NULL, format); |
494 | 0 | if (outbuf->error) |
495 | 0 | ret = (size_t) -1; |
496 | 0 | else |
497 | 0 | ret = xmlBufUse(buf) - use; |
498 | 0 | xmlFree(outbuf); |
499 | 0 | return (ret); |
500 | 0 | } |
501 | | |
502 | | /** |
503 | | * Serialize an HTML node to an xmlBuffer. Always uses UTF-8. |
504 | | * |
505 | | * @param buf the HTML buffer output |
506 | | * @param doc the document |
507 | | * @param cur the current node |
508 | | * @returns the number of bytes written or -1 in case of error |
509 | | */ |
510 | | int |
511 | 0 | htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) { |
512 | 0 | xmlBufPtr buffer; |
513 | 0 | size_t ret1; |
514 | 0 | int ret2; |
515 | |
|
516 | 0 | if ((buf == NULL) || (cur == NULL)) |
517 | 0 | return(-1); |
518 | | |
519 | 0 | xmlInitParser(); |
520 | 0 | buffer = xmlBufFromBuffer(buf); |
521 | 0 | if (buffer == NULL) |
522 | 0 | return(-1); |
523 | | |
524 | 0 | ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1); |
525 | |
|
526 | 0 | ret2 = xmlBufBackToBuffer(buffer, buf); |
527 | |
|
528 | 0 | if ((ret1 == (size_t) -1) || (ret2 < 0)) |
529 | 0 | return(-1); |
530 | 0 | return(ret1 > INT_MAX ? INT_MAX : ret1); |
531 | 0 | } |
532 | | |
533 | | /** |
534 | | * Serialize an HTML node to an xmlBuffer. |
535 | | * |
536 | | * If encoding is NULL, ASCII with HTML 4.0 named character entities |
537 | | * will be used. This is inefficient compared to UTF-8 and might be |
538 | | * changed in a future version. |
539 | | * |
540 | | * @param out the FILE pointer |
541 | | * @param doc the document (unused) |
542 | | * @param cur the current node |
543 | | * @param encoding the document encoding (optional) |
544 | | * @param format should formatting newlines been added |
545 | | * @returns the number of bytes written or -1 in case of failure. |
546 | | */ |
547 | | int |
548 | | htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED, |
549 | 0 | xmlNode *cur, const char *encoding, int format) { |
550 | 0 | xmlOutputBufferPtr buf; |
551 | 0 | xmlCharEncodingHandlerPtr handler; |
552 | 0 | int ret; |
553 | |
|
554 | 0 | xmlInitParser(); |
555 | | |
556 | | /* |
557 | | * save the content to a temp buffer. |
558 | | */ |
559 | 0 | if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) |
560 | 0 | return(-1); |
561 | 0 | buf = xmlOutputBufferCreateFile(out, handler); |
562 | 0 | if (buf == NULL) { |
563 | 0 | xmlCharEncCloseFunc(handler); |
564 | 0 | return(-1); |
565 | 0 | } |
566 | | |
567 | 0 | htmlNodeDumpInternal(buf, cur, NULL, format); |
568 | |
|
569 | 0 | ret = xmlOutputBufferClose(buf); |
570 | 0 | return(ret); |
571 | 0 | } |
572 | | |
573 | | /** |
574 | | * Same as #htmlNodeDumpFileFormat with `format` set to 1 which is |
575 | | * typically undesired. Use of this function is DISCOURAGED in favor |
576 | | * of #htmlNodeDumpFileFormat. |
577 | | * |
578 | | * @param out the FILE pointer |
579 | | * @param doc the document |
580 | | * @param cur the current node |
581 | | */ |
582 | | void |
583 | 0 | htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) { |
584 | 0 | htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); |
585 | 0 | } |
586 | | |
587 | | /** |
588 | | * Serialize an HTML node to a memory, also returning the size of |
589 | | * the result. It's up to the caller to free the memory. |
590 | | * |
591 | | * Uses the encoding of the document. If the document has no |
592 | | * encoding, ASCII with HTML 4.0 named character entities will |
593 | | * be used. This is inefficient compared to UTF-8 and might be |
594 | | * changed in a future version. |
595 | | * |
596 | | * @param cur the document |
597 | | * @param mem OUT: the memory pointer |
598 | | * @param size OUT: the memory length |
599 | | * @param format should formatting newlines been added |
600 | | */ |
601 | | void |
602 | 0 | htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) { |
603 | 0 | xmlOutputBufferPtr buf; |
604 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
605 | |
|
606 | 0 | xmlInitParser(); |
607 | |
|
608 | 0 | if ((mem == NULL) || (size == NULL)) |
609 | 0 | return; |
610 | 0 | *mem = NULL; |
611 | 0 | *size = 0; |
612 | 0 | if (cur == NULL) |
613 | 0 | return; |
614 | | |
615 | 0 | if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) |
616 | 0 | return; |
617 | 0 | buf = xmlAllocOutputBuffer(handler); |
618 | 0 | if (buf == NULL) { |
619 | 0 | xmlCharEncCloseFunc(handler); |
620 | 0 | return; |
621 | 0 | } |
622 | | |
623 | 0 | htmlDocContentDumpFormatOutput(buf, cur, NULL, format); |
624 | |
|
625 | 0 | xmlOutputBufferFlush(buf); |
626 | |
|
627 | 0 | if (!buf->error) { |
628 | 0 | if (buf->conv != NULL) { |
629 | 0 | *size = xmlBufUse(buf->conv); |
630 | 0 | *mem = xmlStrndup(xmlBufContent(buf->conv), *size); |
631 | 0 | } else { |
632 | 0 | *size = xmlBufUse(buf->buffer); |
633 | 0 | *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); |
634 | 0 | } |
635 | 0 | } |
636 | |
|
637 | 0 | xmlOutputBufferClose(buf); |
638 | 0 | } |
639 | | |
640 | | /** |
641 | | * Same as #htmlDocDumpMemoryFormat with `format` set to 1 which |
642 | | * is typically undesired. Also see the warnings there. Use of |
643 | | * this function is DISCOURAGED in favor of |
644 | | * #htmlDocContentDumpFormatOutput. |
645 | | * |
646 | | * @param cur the document |
647 | | * @param mem OUT: the memory pointer |
648 | | * @param size OUT: the memory length |
649 | | */ |
650 | | void |
651 | 0 | htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) { |
652 | 0 | htmlDocDumpMemoryFormat(cur, mem, size, 1); |
653 | 0 | } |
654 | | |
655 | | |
656 | | /************************************************************************ |
657 | | * * |
658 | | * Dumping HTML tree content to an I/O output buffer * |
659 | | * * |
660 | | ************************************************************************/ |
661 | | |
662 | | /** |
663 | | * Serialize the HTML document's DTD, if any. |
664 | | * |
665 | | * Ignores `encoding` and uses the encoding of the output buffer. |
666 | | * |
667 | | * @param buf the HTML buffer output |
668 | | * @param doc the document |
669 | | * @param encoding the encoding string (unused) |
670 | | */ |
671 | | static void |
672 | | htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
673 | 0 | const char *encoding ATTRIBUTE_UNUSED) { |
674 | 0 | xmlDtdPtr cur = doc->intSubset; |
675 | |
|
676 | 0 | if (cur == NULL) |
677 | 0 | return; |
678 | 0 | xmlOutputBufferWrite(buf, 10, "<!DOCTYPE "); |
679 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
680 | 0 | if (cur->ExternalID != NULL) { |
681 | 0 | xmlOutputBufferWrite(buf, 8, " PUBLIC "); |
682 | 0 | xmlOutputBufferWriteQuotedString(buf, cur->ExternalID); |
683 | 0 | if (cur->SystemID != NULL) { |
684 | 0 | xmlOutputBufferWrite(buf, 1, " "); |
685 | 0 | xmlOutputBufferWriteQuotedString(buf, cur->SystemID); |
686 | 0 | } |
687 | 0 | } else if (cur->SystemID != NULL && |
688 | 0 | xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { |
689 | 0 | xmlOutputBufferWrite(buf, 8, " SYSTEM "); |
690 | 0 | xmlOutputBufferWriteQuotedString(buf, cur->SystemID); |
691 | 0 | } |
692 | 0 | xmlOutputBufferWrite(buf, 2, ">\n"); |
693 | 0 | } |
694 | | |
695 | | static void |
696 | 0 | htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) { |
697 | 0 | const xmlChar *tmp = content; |
698 | | |
699 | | /* |
700 | | * See appendix "B.2.1 Non-ASCII characters in URI attribute |
701 | | * values" in the HTML 4.01 spec. This is also recommended |
702 | | * by the HTML output method of the XSLT 1.0 spec. |
703 | | * |
704 | | * We also escape space and control chars. |
705 | | */ |
706 | | |
707 | | /* Skip over initial whitespace */ |
708 | 0 | while (IS_WS_HTML(*tmp)) tmp++; |
709 | 0 | if (tmp > content) { |
710 | 0 | xmlOutputBufferWrite(buf, tmp - content, (char *) content); |
711 | 0 | content = tmp; |
712 | 0 | } |
713 | |
|
714 | 0 | while (1) { |
715 | 0 | char escbuf[3]; |
716 | 0 | const char *repl; |
717 | 0 | int replSize; |
718 | 0 | int c = *tmp; |
719 | |
|
720 | 0 | while ((c > 0x20) && (c < 0x7F) && (c != '"') && (c != '&')) { |
721 | 0 | tmp += 1; |
722 | 0 | c = *tmp; |
723 | 0 | } |
724 | |
|
725 | 0 | if (tmp > content) |
726 | 0 | xmlOutputBufferWrite(buf, tmp - content, (char *) content); |
727 | |
|
728 | 0 | if ((c <= 0x20) || (c >= 0x7F)) { |
729 | 0 | static const char hex[16] = { |
730 | 0 | '0', '1', '2', '3', '4', '5', '6', '7', |
731 | 0 | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' |
732 | 0 | }; |
733 | |
|
734 | 0 | if (c == 0) |
735 | 0 | break; |
736 | | |
737 | 0 | escbuf[0] = '%'; |
738 | 0 | escbuf[1] = hex[(c >> 4) & 0x0F]; |
739 | 0 | escbuf[2] = hex[c & 0x0F]; |
740 | 0 | repl = escbuf; |
741 | 0 | replSize = 3; |
742 | 0 | } else if (c == '"') { |
743 | 0 | repl = """; |
744 | 0 | replSize = 6; |
745 | 0 | } else { |
746 | 0 | repl = "&"; |
747 | 0 | replSize = 5; |
748 | 0 | } |
749 | | |
750 | 0 | xmlOutputBufferWrite(buf, replSize, repl); |
751 | 0 | tmp += 1; |
752 | 0 | content = tmp; |
753 | 0 | } |
754 | 0 | } |
755 | | |
756 | | /** |
757 | | * Serialize an HTML attribute. |
758 | | * |
759 | | * @param buf the HTML buffer output |
760 | | * @param cur the attribute pointer |
761 | | */ |
762 | | static void |
763 | 0 | htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) { |
764 | 0 | xmlOutputBufferWrite(buf, 1, " "); |
765 | |
|
766 | 0 | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
767 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
768 | 0 | xmlOutputBufferWrite(buf, 1, ":"); |
769 | 0 | } |
770 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
771 | | |
772 | | /* |
773 | | * The HTML5 spec requires to always serialize empty attribute |
774 | | * values as `=""`. We should probably align with HTML5 at some |
775 | | * point. |
776 | | */ |
777 | 0 | if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { |
778 | 0 | xmlNodePtr child; |
779 | 0 | int isUri; |
780 | |
|
781 | 0 | xmlOutputBufferWrite(buf, 2, "=\""); |
782 | | |
783 | | /* |
784 | | * Special handling of URIs doesn't conform to HTML5 and |
785 | | * should probably be removed at some point. |
786 | | */ |
787 | 0 | isUri = (cur->ns == NULL) && (cur->parent != NULL) && |
788 | 0 | (cur->parent->ns == NULL) && |
789 | 0 | ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || |
790 | 0 | (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || |
791 | 0 | (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || |
792 | 0 | ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && |
793 | 0 | (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a")))); |
794 | |
|
795 | 0 | for (child = cur->children; child != NULL; child = child->next) { |
796 | 0 | if (child->type == XML_TEXT_NODE) { |
797 | 0 | const xmlChar *content = child->content; |
798 | |
|
799 | 0 | if (content == NULL) |
800 | 0 | continue; |
801 | | |
802 | 0 | if (isUri) { |
803 | 0 | htmlSerializeUri(buf, content); |
804 | 0 | } else { |
805 | 0 | xmlSerializeText(buf, content, SIZE_MAX, |
806 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
807 | 0 | } |
808 | 0 | } else if (child->type == XML_ENTITY_REF_NODE) { |
809 | | /* TODO: We should probably expand entity refs */ |
810 | 0 | xmlOutputBufferWrite(buf, 1, "&"); |
811 | 0 | xmlOutputBufferWriteString(buf, (char *) child->name); |
812 | 0 | xmlOutputBufferWrite(buf, 1, ";"); |
813 | 0 | } |
814 | 0 | } |
815 | |
|
816 | 0 | xmlOutputBufferWrite(buf, 1, "\""); |
817 | 0 | } |
818 | 0 | } |
819 | | |
820 | | /** |
821 | | * Serialize an HTML node to an output buffer. |
822 | | * |
823 | | * If `encoding` is specified, it is used to create or update meta |
824 | | * tags containing the character encoding. |
825 | | * |
826 | | * @param buf the HTML buffer output |
827 | | * @param cur the current node |
828 | | * @param encoding the encoding string (optional) |
829 | | * @param format should formatting newlines been added |
830 | | */ |
831 | | void |
832 | | htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur, |
833 | 0 | const char *encoding, int format) { |
834 | 0 | xmlNodePtr root, parent, metaHead = NULL; |
835 | 0 | xmlAttrPtr attr; |
836 | 0 | const htmlElemDesc * info; |
837 | 0 | int isRaw = 0; |
838 | |
|
839 | 0 | xmlInitParser(); |
840 | |
|
841 | 0 | if ((cur == NULL) || (buf == NULL)) { |
842 | 0 | return; |
843 | 0 | } |
844 | | |
845 | 0 | root = cur; |
846 | 0 | parent = cur->parent; |
847 | 0 | while (1) { |
848 | 0 | switch (cur->type) { |
849 | 0 | case XML_HTML_DOCUMENT_NODE: |
850 | 0 | case XML_DOCUMENT_NODE: |
851 | 0 | if (((xmlDocPtr) cur)->intSubset != NULL) { |
852 | 0 | htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); |
853 | 0 | } |
854 | 0 | if (cur->children != NULL) { |
855 | | /* Always validate cur->parent when descending. */ |
856 | 0 | if (cur->parent == parent) { |
857 | 0 | parent = cur; |
858 | 0 | cur = cur->children; |
859 | 0 | continue; |
860 | 0 | } |
861 | 0 | } else { |
862 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
863 | 0 | } |
864 | 0 | break; |
865 | | |
866 | 0 | case XML_ELEMENT_NODE: { |
867 | 0 | htmlMetaEncoding menc; |
868 | 0 | int isMeta = 0; |
869 | 0 | int addMeta = 0; |
870 | | |
871 | | /* |
872 | | * Some users like lxml are known to pass nodes with a corrupted |
873 | | * tree structure. Fall back to a recursive call to handle this |
874 | | * case. |
875 | | */ |
876 | 0 | if ((cur->parent != parent) && (cur->children != NULL)) { |
877 | 0 | htmlNodeDumpInternal(buf, cur, encoding, format); |
878 | 0 | break; |
879 | 0 | } |
880 | | |
881 | | /* |
882 | | * Get specific HTML info for that node. |
883 | | */ |
884 | 0 | if (cur->ns == NULL) |
885 | 0 | info = htmlTagLookup(cur->name); |
886 | 0 | else |
887 | 0 | info = NULL; |
888 | |
|
889 | 0 | if (encoding != NULL) { |
890 | 0 | isMeta = htmlParseMetaEncoding(cur, &menc); |
891 | | |
892 | | /* |
893 | | * Don't add meta tag for "HTML" encoding. |
894 | | */ |
895 | 0 | if ((xmlStrcasecmp(BAD_CAST encoding, |
896 | 0 | BAD_CAST "HTML") != 0) && |
897 | 0 | (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) && |
898 | 0 | (parent != NULL) && |
899 | 0 | (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) && |
900 | 0 | (parent->parent != NULL) && |
901 | 0 | (parent->parent->parent == NULL) && |
902 | 0 | (metaHead == NULL)) { |
903 | 0 | xmlNodePtr n; |
904 | |
|
905 | 0 | metaHead = cur; |
906 | 0 | addMeta = 1; |
907 | |
|
908 | 0 | for (n = cur->children; n != NULL; n = n->next) { |
909 | 0 | int unused; |
910 | |
|
911 | 0 | if (htmlFindMetaEncodingAttr(n, &unused) != NULL) { |
912 | 0 | metaHead = NULL; |
913 | 0 | addMeta = 0; |
914 | 0 | break; |
915 | 0 | } |
916 | 0 | } |
917 | 0 | } |
918 | 0 | } |
919 | |
|
920 | 0 | xmlOutputBufferWrite(buf, 1, "<"); |
921 | 0 | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
922 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
923 | 0 | xmlOutputBufferWrite(buf, 1, ":"); |
924 | 0 | } |
925 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
926 | 0 | if (cur->nsDef) |
927 | 0 | xmlNsListDumpOutput(buf, cur->nsDef); |
928 | 0 | attr = cur->properties; |
929 | 0 | while (attr != NULL) { |
930 | 0 | if ((!isMeta) || (attr != menc.attr)) { |
931 | 0 | htmlAttrDumpOutput(buf, attr); |
932 | 0 | } else { |
933 | 0 | xmlOutputBufferWrite(buf, 1, " "); |
934 | 0 | xmlOutputBufferWriteString(buf, (char *) attr->name); |
935 | |
|
936 | 0 | xmlOutputBufferWrite(buf, 2, "=\""); |
937 | 0 | xmlSerializeText(buf, menc.attrValue, menc.off.start, |
938 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
939 | 0 | xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
940 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
941 | 0 | xmlSerializeText(buf, menc.attrValue + menc.off.end, |
942 | 0 | menc.off.size - menc.off.end, |
943 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
944 | 0 | xmlOutputBufferWrite(buf, 1, "\""); |
945 | 0 | } |
946 | 0 | attr = attr->next; |
947 | 0 | } |
948 | |
|
949 | 0 | if ((info != NULL) && (info->empty)) { |
950 | 0 | xmlOutputBufferWrite(buf, 1, ">"); |
951 | 0 | } else if (cur->children == NULL) { |
952 | 0 | if (addMeta) { |
953 | 0 | xmlOutputBufferWrite(buf, 16, "><meta charset=\""); |
954 | 0 | xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
955 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
956 | 0 | xmlOutputBufferWrite(buf, 4, "\"></"); |
957 | 0 | } else { |
958 | 0 | xmlOutputBufferWrite(buf, 3, "></"); |
959 | 0 | } |
960 | 0 | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
961 | 0 | xmlOutputBufferWriteString(buf, |
962 | 0 | (const char *)cur->ns->prefix); |
963 | 0 | xmlOutputBufferWrite(buf, 1, ":"); |
964 | 0 | } |
965 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
966 | 0 | xmlOutputBufferWrite(buf, 1, ">"); |
967 | 0 | } else { |
968 | 0 | xmlOutputBufferWrite(buf, 1, ">"); |
969 | 0 | if ((format) && |
970 | 0 | ((addMeta) || |
971 | 0 | ((info != NULL) && (!info->isinline) && |
972 | 0 | (cur->children->type != HTML_TEXT_NODE) && |
973 | 0 | (cur->children->type != HTML_ENTITY_REF_NODE) && |
974 | 0 | (cur->children != cur->last) && |
975 | 0 | (cur->name != NULL) && |
976 | 0 | (cur->name[0] != 'p')))) /* p, pre, param */ |
977 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
978 | 0 | if (addMeta) { |
979 | 0 | xmlOutputBufferWrite(buf, 15, "<meta charset=\""); |
980 | 0 | xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX, |
981 | 0 | XML_ESCAPE_HTML | XML_ESCAPE_ATTR); |
982 | 0 | xmlOutputBufferWrite(buf, 2, "\">"); |
983 | 0 | if ((format) && |
984 | 0 | (cur->children->type != HTML_TEXT_NODE) && |
985 | 0 | (cur->children->type != HTML_ENTITY_REF_NODE)) |
986 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
987 | 0 | } |
988 | |
|
989 | 0 | if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT)) |
990 | 0 | isRaw = 1; |
991 | |
|
992 | 0 | parent = cur; |
993 | 0 | cur = cur->children; |
994 | 0 | continue; |
995 | 0 | } |
996 | | |
997 | 0 | if ((format) && (cur->next != NULL) && |
998 | 0 | (info != NULL) && (!info->isinline)) { |
999 | 0 | if ((cur->next->type != HTML_TEXT_NODE) && |
1000 | 0 | (cur->next->type != HTML_ENTITY_REF_NODE) && |
1001 | 0 | (parent != NULL) && |
1002 | 0 | (parent->name != NULL) && |
1003 | 0 | (parent->name[0] != 'p')) /* p, pre, param */ |
1004 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
1005 | 0 | } |
1006 | |
|
1007 | 0 | break; |
1008 | 0 | } |
1009 | | |
1010 | 0 | case XML_ATTRIBUTE_NODE: |
1011 | 0 | htmlAttrDumpOutput(buf, (xmlAttrPtr) cur); |
1012 | 0 | break; |
1013 | | |
1014 | 0 | case HTML_TEXT_NODE: |
1015 | 0 | if (cur->content == NULL) |
1016 | 0 | break; |
1017 | 0 | if ((cur->name == (const xmlChar *)xmlStringTextNoenc) || |
1018 | 0 | (isRaw)) { |
1019 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->content); |
1020 | 0 | } else { |
1021 | 0 | xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML); |
1022 | 0 | } |
1023 | 0 | break; |
1024 | | |
1025 | 0 | case HTML_COMMENT_NODE: |
1026 | 0 | if (cur->content != NULL) { |
1027 | 0 | xmlOutputBufferWrite(buf, 4, "<!--"); |
1028 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->content); |
1029 | 0 | xmlOutputBufferWrite(buf, 3, "-->"); |
1030 | 0 | } |
1031 | 0 | break; |
1032 | | |
1033 | 0 | case HTML_PI_NODE: |
1034 | 0 | if (cur->name != NULL) { |
1035 | 0 | xmlOutputBufferWrite(buf, 2, "<?"); |
1036 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
1037 | 0 | if (cur->content != NULL) { |
1038 | 0 | xmlOutputBufferWrite(buf, 1, " "); |
1039 | 0 | xmlOutputBufferWriteString(buf, |
1040 | 0 | (const char *)cur->content); |
1041 | 0 | } |
1042 | 0 | xmlOutputBufferWrite(buf, 1, ">"); |
1043 | 0 | } |
1044 | 0 | break; |
1045 | | |
1046 | 0 | case HTML_ENTITY_REF_NODE: |
1047 | 0 | xmlOutputBufferWrite(buf, 1, "&"); |
1048 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
1049 | 0 | xmlOutputBufferWrite(buf, 1, ";"); |
1050 | 0 | break; |
1051 | | |
1052 | 0 | case HTML_PRESERVE_NODE: |
1053 | 0 | if (cur->content != NULL) { |
1054 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->content); |
1055 | 0 | } |
1056 | 0 | break; |
1057 | | |
1058 | 0 | default: |
1059 | 0 | break; |
1060 | 0 | } |
1061 | | |
1062 | 0 | while (1) { |
1063 | 0 | if (cur == root) |
1064 | 0 | return; |
1065 | 0 | if (cur->next != NULL) { |
1066 | 0 | cur = cur->next; |
1067 | 0 | break; |
1068 | 0 | } |
1069 | | |
1070 | 0 | isRaw = 0; |
1071 | |
|
1072 | 0 | cur = parent; |
1073 | | /* cur->parent was validated when descending. */ |
1074 | 0 | parent = cur->parent; |
1075 | |
|
1076 | 0 | if ((cur->type == XML_HTML_DOCUMENT_NODE) || |
1077 | 0 | (cur->type == XML_DOCUMENT_NODE)) { |
1078 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
1079 | 0 | } else { |
1080 | 0 | if ((format) && (cur->ns == NULL)) |
1081 | 0 | info = htmlTagLookup(cur->name); |
1082 | 0 | else |
1083 | 0 | info = NULL; |
1084 | |
|
1085 | 0 | if ((format) && (info != NULL) && (!info->isinline) && |
1086 | 0 | (cur->last->type != HTML_TEXT_NODE) && |
1087 | 0 | (cur->last->type != HTML_ENTITY_REF_NODE) && |
1088 | 0 | ((cur->children != cur->last) || (cur == metaHead)) && |
1089 | 0 | (cur->name != NULL) && |
1090 | 0 | (cur->name[0] != 'p')) /* p, pre, param */ |
1091 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
1092 | |
|
1093 | 0 | xmlOutputBufferWrite(buf, 2, "</"); |
1094 | 0 | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
1095 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
1096 | 0 | xmlOutputBufferWrite(buf, 1, ":"); |
1097 | 0 | } |
1098 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
1099 | 0 | xmlOutputBufferWrite(buf, 1, ">"); |
1100 | |
|
1101 | 0 | if ((format) && (info != NULL) && (!info->isinline) && |
1102 | 0 | (cur->next != NULL)) { |
1103 | 0 | if ((cur->next->type != HTML_TEXT_NODE) && |
1104 | 0 | (cur->next->type != HTML_ENTITY_REF_NODE) && |
1105 | 0 | (parent != NULL) && |
1106 | 0 | (parent->name != NULL) && |
1107 | 0 | (parent->name[0] != 'p')) /* p, pre, param */ |
1108 | 0 | xmlOutputBufferWrite(buf, 1, "\n"); |
1109 | 0 | } |
1110 | |
|
1111 | 0 | if (cur == metaHead) |
1112 | 0 | metaHead = NULL; |
1113 | 0 | } |
1114 | 0 | } |
1115 | 0 | } |
1116 | 0 | } |
1117 | | |
1118 | | /** |
1119 | | * Serialize an HTML node to an output buffer. |
1120 | | * |
1121 | | * @param buf the HTML buffer output |
1122 | | * @param doc the document (unused) |
1123 | | * @param cur the current node |
1124 | | * @param encoding the encoding string (unused) |
1125 | | * @param format should formatting newlines been added |
1126 | | */ |
1127 | | void |
1128 | | htmlNodeDumpFormatOutput(xmlOutputBuffer *buf, |
1129 | | xmlDoc *doc ATTRIBUTE_UNUSED, xmlNode *cur, |
1130 | 0 | const char *encoding ATTRIBUTE_UNUSED, int format) { |
1131 | 0 | htmlNodeDumpInternal(buf, cur, NULL, format); |
1132 | 0 | } |
1133 | | |
1134 | | /** |
1135 | | * Same as #htmlNodeDumpFormatOutput with `format` set to 1 which is |
1136 | | * typically undesired. Use of this function is DISCOURAGED in favor |
1137 | | * of #htmlNodeDumpFormatOutput. |
1138 | | * |
1139 | | * @param buf the HTML buffer output |
1140 | | * @param doc the document (unused) |
1141 | | * @param cur the current node |
1142 | | * @param encoding the encoding string (unused) |
1143 | | */ |
1144 | | void |
1145 | | htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED, |
1146 | 0 | xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED) { |
1147 | 0 | htmlNodeDumpInternal(buf, cur, NULL, 1); |
1148 | 0 | } |
1149 | | |
1150 | | /** |
1151 | | * Serialize an HTML document to an output buffer. |
1152 | | * |
1153 | | * @param buf the HTML buffer output |
1154 | | * @param cur the document |
1155 | | * @param encoding the encoding string (unused) |
1156 | | * @param format should formatting newlines been added |
1157 | | */ |
1158 | | void |
1159 | | htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur, |
1160 | | const char *encoding ATTRIBUTE_UNUSED, |
1161 | 0 | int format) { |
1162 | 0 | htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format); |
1163 | 0 | } |
1164 | | |
1165 | | /** |
1166 | | * Same as #htmlDocContentDumpFormatOutput with `format` set to 1 |
1167 | | * which is typically undesired. Use of this function is DISCOURAGED |
1168 | | * in favor of #htmlDocContentDumpFormatOutput. |
1169 | | * |
1170 | | * @param buf the HTML buffer output |
1171 | | * @param cur the document |
1172 | | * @param encoding the encoding string (unused) |
1173 | | */ |
1174 | | void |
1175 | | htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur, |
1176 | 0 | const char *encoding ATTRIBUTE_UNUSED) { |
1177 | 0 | htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1); |
1178 | 0 | } |
1179 | | |
1180 | | /************************************************************************ |
1181 | | * * |
1182 | | * Saving functions front-ends * |
1183 | | * * |
1184 | | ************************************************************************/ |
1185 | | |
1186 | | /** |
1187 | | * Serialize an HTML document to an open `FILE`. |
1188 | | * |
1189 | | * Uses the encoding of the document. If the document has no |
1190 | | * encoding, ASCII with HTML 4.0 named character entities will |
1191 | | * be used. This is inefficient compared to UTF-8 and might be |
1192 | | * changed in a future version. |
1193 | | * |
1194 | | * Enables "formatting" unconditionally which is typically |
1195 | | * undesired. |
1196 | | * |
1197 | | * Use of this function is DISCOURAGED in favor of |
1198 | | * #htmlNodeDumpFileFormat. |
1199 | | * |
1200 | | * @param f the FILE* |
1201 | | * @param cur the document |
1202 | | * @returns the number of bytes written or -1 in case of failure. |
1203 | | */ |
1204 | | int |
1205 | 0 | htmlDocDump(FILE *f, xmlDoc *cur) { |
1206 | 0 | xmlOutputBufferPtr buf; |
1207 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
1208 | 0 | int ret; |
1209 | |
|
1210 | 0 | xmlInitParser(); |
1211 | |
|
1212 | 0 | if ((cur == NULL) || (f == NULL)) { |
1213 | 0 | return(-1); |
1214 | 0 | } |
1215 | | |
1216 | 0 | if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK) |
1217 | 0 | return(-1); |
1218 | 0 | buf = xmlOutputBufferCreateFile(f, handler); |
1219 | 0 | if (buf == NULL) { |
1220 | 0 | xmlCharEncCloseFunc(handler); |
1221 | 0 | return(-1); |
1222 | 0 | } |
1223 | 0 | htmlDocContentDumpOutput(buf, cur, NULL); |
1224 | |
|
1225 | 0 | ret = xmlOutputBufferClose(buf); |
1226 | 0 | return(ret); |
1227 | 0 | } |
1228 | | |
1229 | | /** |
1230 | | * Serialize an HTML document to a file. |
1231 | | * |
1232 | | * Same as #htmlSaveFileFormat with `encoding` set to NULL and |
1233 | | * `format` set to 1 which is typically undesired. |
1234 | | * |
1235 | | * Use of this function is DISCOURAGED in favor of |
1236 | | * #htmlSaveFileFormat. |
1237 | | * |
1238 | | * @param filename the filename (or URL) |
1239 | | * @param cur the document |
1240 | | * @returns the number of bytes written or -1 in case of failure. |
1241 | | */ |
1242 | | int |
1243 | 0 | htmlSaveFile(const char *filename, xmlDoc *cur) { |
1244 | 0 | return(htmlSaveFileFormat(filename, cur, NULL, 1)); |
1245 | 0 | } |
1246 | | |
1247 | | /** |
1248 | | * Serialize an HTML document to a file using a given encoding. |
1249 | | * |
1250 | | * If `filename` is `"-"`, stdout is used. This is potentially |
1251 | | * insecure and might be changed in a future version. |
1252 | | * |
1253 | | * If encoding is NULL, ASCII with HTML 4.0 named character entities |
1254 | | * will be used. This is inefficient compared to UTF-8 and might be |
1255 | | * changed in a future version. |
1256 | | * |
1257 | | * Sets or updates meta tags containing the character encoding. |
1258 | | * |
1259 | | * @param filename the filename |
1260 | | * @param cur the document |
1261 | | * @param format should formatting newlines been added |
1262 | | * @param encoding the document encoding (optional) |
1263 | | * @returns the number of bytes written or -1 in case of failure. |
1264 | | */ |
1265 | | int |
1266 | | htmlSaveFileFormat(const char *filename, xmlDoc *cur, |
1267 | 0 | const char *encoding, int format) { |
1268 | 0 | xmlOutputBufferPtr buf; |
1269 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
1270 | 0 | int ret; |
1271 | |
|
1272 | 0 | if ((cur == NULL) || (filename == NULL)) |
1273 | 0 | return(-1); |
1274 | | |
1275 | 0 | xmlInitParser(); |
1276 | |
|
1277 | 0 | if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK) |
1278 | 0 | return(-1); |
1279 | | |
1280 | | /* |
1281 | | * save the content to a temp buffer. |
1282 | | */ |
1283 | 0 | buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); |
1284 | 0 | if (buf == NULL) { |
1285 | 0 | xmlCharEncCloseFunc(handler); |
1286 | 0 | return(0); |
1287 | 0 | } |
1288 | | |
1289 | 0 | htmlDocContentDumpFormatOutput(buf, cur, encoding, format); |
1290 | |
|
1291 | 0 | ret = xmlOutputBufferClose(buf); |
1292 | 0 | return(ret); |
1293 | 0 | } |
1294 | | |
1295 | | /** |
1296 | | * Serialize an HTML document to a file. |
1297 | | * |
1298 | | * Same as #htmlSaveFileFormat with `format` set to 1 which is |
1299 | | * typically undesired. Also see the warnings there. Use of this |
1300 | | * function is DISCOURAGED in favor of #htmlSaveFileFormat. |
1301 | | * |
1302 | | * @param filename the filename |
1303 | | * @param cur the document |
1304 | | * @param encoding the document encoding |
1305 | | * @returns the number of bytes written or -1 in case of failure. |
1306 | | */ |
1307 | | int |
1308 | 0 | htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) { |
1309 | 0 | return(htmlSaveFileFormat(filename, cur, encoding, 1)); |
1310 | 0 | } |
1311 | | |
1312 | | #endif /* LIBXML_OUTPUT_ENABLED */ |
1313 | | |
1314 | | #endif /* LIBXML_HTML_ENABLED */ |