/src/libxml2-2.10.3/HTMLtree.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * HTMLtree.c : implementation of access function for an HTML tree. |
3 | | * |
4 | | * See Copyright for the status of this software. |
5 | | * |
6 | | * daniel@veillard.com |
7 | | */ |
8 | | |
9 | | |
10 | | #define IN_LIBXML |
11 | | #include "libxml.h" |
12 | | #ifdef LIBXML_HTML_ENABLED |
13 | | |
14 | | #include <string.h> /* for memset() only ! */ |
15 | | #include <ctype.h> |
16 | | #include <stdlib.h> |
17 | | |
18 | | #include <libxml/xmlmemory.h> |
19 | | #include <libxml/HTMLparser.h> |
20 | | #include <libxml/HTMLtree.h> |
21 | | #include <libxml/entities.h> |
22 | | #include <libxml/valid.h> |
23 | | #include <libxml/xmlerror.h> |
24 | | #include <libxml/parserInternals.h> |
25 | | #include <libxml/globals.h> |
26 | | #include <libxml/uri.h> |
27 | | |
28 | | #include "buf.h" |
29 | | |
30 | | /************************************************************************ |
31 | | * * |
32 | | * Getting/Setting encoding meta tags * |
33 | | * * |
34 | | ************************************************************************/ |
35 | | |
36 | | /** |
37 | | * htmlGetMetaEncoding: |
38 | | * @doc: the document |
39 | | * |
40 | | * Encoding definition lookup in the Meta tags |
41 | | * |
42 | | * Returns the current encoding as flagged in the HTML source |
43 | | */ |
44 | | const xmlChar * |
45 | 0 | htmlGetMetaEncoding(htmlDocPtr doc) { |
46 | 0 | htmlNodePtr cur; |
47 | 0 | const xmlChar *content; |
48 | 0 | const xmlChar *encoding; |
49 | |
|
50 | 0 | if (doc == NULL) |
51 | 0 | return(NULL); |
52 | 0 | cur = doc->children; |
53 | | |
54 | | /* |
55 | | * Search the html |
56 | | */ |
57 | 0 | while (cur != NULL) { |
58 | 0 | if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { |
59 | 0 | if (xmlStrEqual(cur->name, BAD_CAST"html")) |
60 | 0 | break; |
61 | 0 | if (xmlStrEqual(cur->name, BAD_CAST"head")) |
62 | 0 | goto found_head; |
63 | 0 | if (xmlStrEqual(cur->name, BAD_CAST"meta")) |
64 | 0 | goto found_meta; |
65 | 0 | } |
66 | 0 | cur = cur->next; |
67 | 0 | } |
68 | 0 | if (cur == NULL) |
69 | 0 | return(NULL); |
70 | 0 | cur = cur->children; |
71 | | |
72 | | /* |
73 | | * Search the head |
74 | | */ |
75 | 0 | while (cur != NULL) { |
76 | 0 | if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { |
77 | 0 | if (xmlStrEqual(cur->name, BAD_CAST"head")) |
78 | 0 | break; |
79 | 0 | if (xmlStrEqual(cur->name, BAD_CAST"meta")) |
80 | 0 | goto found_meta; |
81 | 0 | } |
82 | 0 | cur = cur->next; |
83 | 0 | } |
84 | 0 | if (cur == NULL) |
85 | 0 | return(NULL); |
86 | 0 | found_head: |
87 | 0 | cur = cur->children; |
88 | | |
89 | | /* |
90 | | * Search the meta elements |
91 | | */ |
92 | 0 | found_meta: |
93 | 0 | while (cur != NULL) { |
94 | 0 | if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { |
95 | 0 | if (xmlStrEqual(cur->name, BAD_CAST"meta")) { |
96 | 0 | xmlAttrPtr attr = cur->properties; |
97 | 0 | int http; |
98 | 0 | const xmlChar *value; |
99 | |
|
100 | 0 | content = NULL; |
101 | 0 | http = 0; |
102 | 0 | while (attr != NULL) { |
103 | 0 | if ((attr->children != NULL) && |
104 | 0 | (attr->children->type == XML_TEXT_NODE) && |
105 | 0 | (attr->children->next == NULL)) { |
106 | 0 | value = attr->children->content; |
107 | 0 | if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) |
108 | 0 | && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) |
109 | 0 | http = 1; |
110 | 0 | else if ((value != NULL) |
111 | 0 | && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) |
112 | 0 | content = value; |
113 | 0 | if ((http != 0) && (content != NULL)) |
114 | 0 | goto found_content; |
115 | 0 | } |
116 | 0 | attr = attr->next; |
117 | 0 | } |
118 | 0 | } |
119 | 0 | } |
120 | 0 | cur = cur->next; |
121 | 0 | } |
122 | 0 | return(NULL); |
123 | | |
124 | 0 | found_content: |
125 | 0 | encoding = xmlStrstr(content, BAD_CAST"charset="); |
126 | 0 | if (encoding == NULL) |
127 | 0 | encoding = xmlStrstr(content, BAD_CAST"Charset="); |
128 | 0 | if (encoding == NULL) |
129 | 0 | encoding = xmlStrstr(content, BAD_CAST"CHARSET="); |
130 | 0 | if (encoding != NULL) { |
131 | 0 | encoding += 8; |
132 | 0 | } else { |
133 | 0 | encoding = xmlStrstr(content, BAD_CAST"charset ="); |
134 | 0 | if (encoding == NULL) |
135 | 0 | encoding = xmlStrstr(content, BAD_CAST"Charset ="); |
136 | 0 | if (encoding == NULL) |
137 | 0 | encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); |
138 | 0 | if (encoding != NULL) |
139 | 0 | encoding += 9; |
140 | 0 | } |
141 | 0 | if (encoding != NULL) { |
142 | 0 | while ((*encoding == ' ') || (*encoding == '\t')) encoding++; |
143 | 0 | } |
144 | 0 | return(encoding); |
145 | 0 | } |
146 | | |
147 | | /** |
148 | | * htmlSetMetaEncoding: |
149 | | * @doc: the document |
150 | | * @encoding: the encoding string |
151 | | * |
152 | | * Sets the current encoding in the Meta tags |
153 | | * NOTE: this will not change the document content encoding, just |
154 | | * the META flag associated. |
155 | | * |
156 | | * Returns 0 in case of success and -1 in case of error |
157 | | */ |
158 | | int |
159 | 0 | htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { |
160 | 0 | htmlNodePtr cur, meta = NULL, head = NULL; |
161 | 0 | const xmlChar *content = NULL; |
162 | 0 | char newcontent[100]; |
163 | |
|
164 | 0 | newcontent[0] = 0; |
165 | |
|
166 | 0 | if (doc == NULL) |
167 | 0 | return(-1); |
168 | | |
169 | | /* html isn't a real encoding it's just libxml2 way to get entities */ |
170 | 0 | if (!xmlStrcasecmp(encoding, BAD_CAST "html")) |
171 | 0 | return(-1); |
172 | | |
173 | 0 | if (encoding != NULL) { |
174 | 0 | snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", |
175 | 0 | (char *)encoding); |
176 | 0 | newcontent[sizeof(newcontent) - 1] = 0; |
177 | 0 | } |
178 | |
|
179 | 0 | cur = doc->children; |
180 | | |
181 | | /* |
182 | | * Search the html |
183 | | */ |
184 | 0 | while (cur != NULL) { |
185 | 0 | if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { |
186 | 0 | if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) |
187 | 0 | break; |
188 | 0 | if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) |
189 | 0 | goto found_head; |
190 | 0 | if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) |
191 | 0 | goto found_meta; |
192 | 0 | } |
193 | 0 | cur = cur->next; |
194 | 0 | } |
195 | 0 | if (cur == NULL) |
196 | 0 | return(-1); |
197 | 0 | cur = cur->children; |
198 | | |
199 | | /* |
200 | | * Search the head |
201 | | */ |
202 | 0 | while (cur != NULL) { |
203 | 0 | if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { |
204 | 0 | if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) |
205 | 0 | break; |
206 | 0 | if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { |
207 | 0 | head = cur->parent; |
208 | 0 | goto found_meta; |
209 | 0 | } |
210 | 0 | } |
211 | 0 | cur = cur->next; |
212 | 0 | } |
213 | 0 | if (cur == NULL) |
214 | 0 | return(-1); |
215 | 0 | found_head: |
216 | 0 | head = cur; |
217 | 0 | if (cur->children == NULL) |
218 | 0 | goto create; |
219 | 0 | cur = cur->children; |
220 | |
|
221 | 0 | found_meta: |
222 | | /* |
223 | | * Search and update all the remaining the meta elements carrying |
224 | | * encoding information |
225 | | */ |
226 | 0 | while (cur != NULL) { |
227 | 0 | if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { |
228 | 0 | if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { |
229 | 0 | xmlAttrPtr attr = cur->properties; |
230 | 0 | int http; |
231 | 0 | const xmlChar *value; |
232 | |
|
233 | 0 | content = NULL; |
234 | 0 | http = 0; |
235 | 0 | while (attr != NULL) { |
236 | 0 | if ((attr->children != NULL) && |
237 | 0 | (attr->children->type == XML_TEXT_NODE) && |
238 | 0 | (attr->children->next == NULL)) { |
239 | 0 | value = attr->children->content; |
240 | 0 | if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) |
241 | 0 | && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) |
242 | 0 | http = 1; |
243 | 0 | else |
244 | 0 | { |
245 | 0 | if ((value != NULL) && |
246 | 0 | (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) |
247 | 0 | content = value; |
248 | 0 | } |
249 | 0 | if ((http != 0) && (content != NULL)) |
250 | 0 | break; |
251 | 0 | } |
252 | 0 | attr = attr->next; |
253 | 0 | } |
254 | 0 | if ((http != 0) && (content != NULL)) { |
255 | 0 | meta = cur; |
256 | 0 | break; |
257 | 0 | } |
258 | |
|
259 | 0 | } |
260 | 0 | } |
261 | 0 | cur = cur->next; |
262 | 0 | } |
263 | 0 | create: |
264 | 0 | if (meta == NULL) { |
265 | 0 | if ((encoding != NULL) && (head != NULL)) { |
266 | | /* |
267 | | * Create a new Meta element with the right attributes |
268 | | */ |
269 | |
|
270 | 0 | meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); |
271 | 0 | if (head->children == NULL) |
272 | 0 | xmlAddChild(head, meta); |
273 | 0 | else |
274 | 0 | xmlAddPrevSibling(head->children, meta); |
275 | 0 | xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); |
276 | 0 | xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); |
277 | 0 | } |
278 | 0 | } else { |
279 | | /* remove the meta tag if NULL is passed */ |
280 | 0 | if (encoding == NULL) { |
281 | 0 | xmlUnlinkNode(meta); |
282 | 0 | xmlFreeNode(meta); |
283 | 0 | } |
284 | | /* change the document only if there is a real encoding change */ |
285 | 0 | else if (xmlStrcasestr(content, encoding) == NULL) { |
286 | 0 | xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); |
287 | 0 | } |
288 | 0 | } |
289 | | |
290 | |
|
291 | 0 | return(0); |
292 | 0 | } |
293 | | |
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which are serialized in
 * minimized form, i.e. <option selected="selected"> is written as
 * <option selected>, as per XSLT 1.0 16.2 "HTML Output Method".
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
307 | | |
308 | | |
309 | | /** |
310 | | * htmlIsBooleanAttr: |
311 | | * @name: the name of the attribute to check |
312 | | * |
313 | | * Determine if a given attribute is a boolean attribute. |
314 | | * |
315 | | * returns: false if the attribute is not boolean, true otherwise. |
316 | | */ |
317 | | int |
318 | | htmlIsBooleanAttr(const xmlChar *name) |
319 | 0 | { |
320 | 0 | int i = 0; |
321 | |
|
322 | 0 | while (htmlBooleanAttrs[i] != NULL) { |
323 | 0 | if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) |
324 | 0 | return 1; |
325 | 0 | i++; |
326 | 0 | } |
327 | 0 | return 0; |
328 | 0 | } |
329 | | |
330 | | #ifdef LIBXML_OUTPUT_ENABLED |
331 | | /* |
332 | | * private routine exported from xmlIO.c |
333 | | */ |
334 | | xmlOutputBufferPtr |
335 | | xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); |
336 | | /************************************************************************ |
337 | | * * |
338 | | * Output error handlers * |
339 | | * * |
340 | | ************************************************************************/ |
341 | | /** |
342 | | * htmlSaveErrMemory: |
343 | | * @extra: extra information |
344 | | * |
345 | | * Handle an out of memory condition |
346 | | */ |
347 | | static void |
348 | | htmlSaveErrMemory(const char *extra) |
349 | 0 | { |
350 | 0 | __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); |
351 | 0 | } |
352 | | |
353 | | /** |
354 | | * htmlSaveErr: |
355 | | * @code: the error number |
356 | | * @node: the location of the error. |
357 | | * @extra: extra information |
358 | | * |
359 | | * Handle an out of memory condition |
360 | | */ |
361 | | static void |
362 | | htmlSaveErr(int code, xmlNodePtr node, const char *extra) |
363 | 0 | { |
364 | 0 | const char *msg = NULL; |
365 | |
|
366 | 0 | switch(code) { |
367 | 0 | case XML_SAVE_NOT_UTF8: |
368 | 0 | msg = "string is not in UTF-8\n"; |
369 | 0 | break; |
370 | 0 | case XML_SAVE_CHAR_INVALID: |
371 | 0 | msg = "invalid character value\n"; |
372 | 0 | break; |
373 | 0 | case XML_SAVE_UNKNOWN_ENCODING: |
374 | 0 | msg = "unknown encoding %s\n"; |
375 | 0 | break; |
376 | 0 | case XML_SAVE_NO_DOCTYPE: |
377 | 0 | msg = "HTML has no DOCTYPE\n"; |
378 | 0 | break; |
379 | 0 | default: |
380 | 0 | msg = "unexpected error number\n"; |
381 | 0 | } |
382 | 0 | __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); |
383 | 0 | } |
384 | | |
385 | | /************************************************************************ |
386 | | * * |
387 | | * Dumping HTML tree content to a simple buffer * |
388 | | * * |
389 | | ************************************************************************/ |
390 | | |
391 | | /** |
392 | | * htmlBufNodeDumpFormat: |
393 | | * @buf: the xmlBufPtr output |
394 | | * @doc: the document |
395 | | * @cur: the current node |
396 | | * @format: should formatting spaces been added |
397 | | * |
398 | | * Dump an HTML node, recursive behaviour,children are printed too. |
399 | | * |
400 | | * Returns the number of byte written or -1 in case of error |
401 | | */ |
402 | | static size_t |
403 | | htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, |
404 | 0 | int format) { |
405 | 0 | size_t use; |
406 | 0 | int ret; |
407 | 0 | xmlOutputBufferPtr outbuf; |
408 | |
|
409 | 0 | if (cur == NULL) { |
410 | 0 | return (-1); |
411 | 0 | } |
412 | 0 | if (buf == NULL) { |
413 | 0 | return (-1); |
414 | 0 | } |
415 | 0 | outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); |
416 | 0 | if (outbuf == NULL) { |
417 | 0 | htmlSaveErrMemory("allocating HTML output buffer"); |
418 | 0 | return (-1); |
419 | 0 | } |
420 | 0 | memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); |
421 | 0 | outbuf->buffer = buf; |
422 | 0 | outbuf->encoder = NULL; |
423 | 0 | outbuf->writecallback = NULL; |
424 | 0 | outbuf->closecallback = NULL; |
425 | 0 | outbuf->context = NULL; |
426 | 0 | outbuf->written = 0; |
427 | |
|
428 | 0 | use = xmlBufUse(buf); |
429 | 0 | htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); |
430 | 0 | xmlFree(outbuf); |
431 | 0 | ret = xmlBufUse(buf) - use; |
432 | 0 | return (ret); |
433 | 0 | } |
434 | | |
435 | | /** |
436 | | * htmlNodeDump: |
437 | | * @buf: the HTML buffer output |
438 | | * @doc: the document |
439 | | * @cur: the current node |
440 | | * |
441 | | * Dump an HTML node, recursive behaviour,children are printed too, |
442 | | * and formatting returns are added. |
443 | | * |
444 | | * Returns the number of byte written or -1 in case of error |
445 | | */ |
446 | | int |
447 | 0 | htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { |
448 | 0 | xmlBufPtr buffer; |
449 | 0 | size_t ret; |
450 | |
|
451 | 0 | if ((buf == NULL) || (cur == NULL)) |
452 | 0 | return(-1); |
453 | | |
454 | 0 | xmlInitParser(); |
455 | 0 | buffer = xmlBufFromBuffer(buf); |
456 | 0 | if (buffer == NULL) |
457 | 0 | return(-1); |
458 | | |
459 | 0 | ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); |
460 | |
|
461 | 0 | xmlBufBackToBuffer(buffer); |
462 | |
|
463 | 0 | if (ret > INT_MAX) |
464 | 0 | return(-1); |
465 | 0 | return((int) ret); |
466 | 0 | } |
467 | | |
468 | | /** |
469 | | * htmlNodeDumpFileFormat: |
470 | | * @out: the FILE pointer |
471 | | * @doc: the document |
472 | | * @cur: the current node |
473 | | * @encoding: the document encoding |
474 | | * @format: should formatting spaces been added |
475 | | * |
476 | | * Dump an HTML node, recursive behaviour,children are printed too. |
477 | | * |
478 | | * TODO: if encoding == NULL try to save in the doc encoding |
479 | | * |
480 | | * returns: the number of byte written or -1 in case of failure. |
481 | | */ |
482 | | int |
483 | | htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, |
484 | 0 | xmlNodePtr cur, const char *encoding, int format) { |
485 | 0 | xmlOutputBufferPtr buf; |
486 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
487 | 0 | int ret; |
488 | |
|
489 | 0 | xmlInitParser(); |
490 | |
|
491 | 0 | if (encoding != NULL) { |
492 | 0 | xmlCharEncoding enc; |
493 | |
|
494 | 0 | enc = xmlParseCharEncoding(encoding); |
495 | 0 | if (enc != XML_CHAR_ENCODING_UTF8) { |
496 | 0 | handler = xmlFindCharEncodingHandler(encoding); |
497 | 0 | if (handler == NULL) |
498 | 0 | htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); |
499 | 0 | } |
500 | 0 | } else { |
501 | | /* |
502 | | * Fallback to HTML or ASCII when the encoding is unspecified |
503 | | */ |
504 | 0 | if (handler == NULL) |
505 | 0 | handler = xmlFindCharEncodingHandler("HTML"); |
506 | 0 | if (handler == NULL) |
507 | 0 | handler = xmlFindCharEncodingHandler("ascii"); |
508 | 0 | } |
509 | | |
510 | | /* |
511 | | * save the content to a temp buffer. |
512 | | */ |
513 | 0 | buf = xmlOutputBufferCreateFile(out, handler); |
514 | 0 | if (buf == NULL) return(0); |
515 | | |
516 | 0 | htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format); |
517 | |
|
518 | 0 | ret = xmlOutputBufferClose(buf); |
519 | 0 | return(ret); |
520 | 0 | } |
521 | | |
522 | | /** |
523 | | * htmlNodeDumpFile: |
524 | | * @out: the FILE pointer |
525 | | * @doc: the document |
526 | | * @cur: the current node |
527 | | * |
528 | | * Dump an HTML node, recursive behaviour,children are printed too, |
529 | | * and formatting returns are added. |
530 | | */ |
531 | | void |
532 | 0 | htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { |
533 | 0 | htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); |
534 | 0 | } |
535 | | |
536 | | /** |
537 | | * htmlDocDumpMemoryFormat: |
538 | | * @cur: the document |
539 | | * @mem: OUT: the memory pointer |
540 | | * @size: OUT: the memory length |
541 | | * @format: should formatting spaces been added |
542 | | * |
543 | | * Dump an HTML document in memory and return the xmlChar * and it's size. |
544 | | * It's up to the caller to free the memory. |
545 | | */ |
546 | | void |
547 | 0 | htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { |
548 | 0 | xmlOutputBufferPtr buf; |
549 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
550 | 0 | const char *encoding; |
551 | |
|
552 | 0 | xmlInitParser(); |
553 | |
|
554 | 0 | if ((mem == NULL) || (size == NULL)) |
555 | 0 | return; |
556 | 0 | if (cur == NULL) { |
557 | 0 | *mem = NULL; |
558 | 0 | *size = 0; |
559 | 0 | return; |
560 | 0 | } |
561 | | |
562 | 0 | encoding = (const char *) htmlGetMetaEncoding(cur); |
563 | |
|
564 | 0 | if (encoding != NULL) { |
565 | 0 | xmlCharEncoding enc; |
566 | |
|
567 | 0 | enc = xmlParseCharEncoding(encoding); |
568 | 0 | if (enc != XML_CHAR_ENCODING_UTF8) { |
569 | 0 | handler = xmlFindCharEncodingHandler(encoding); |
570 | 0 | if (handler == NULL) |
571 | 0 | htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); |
572 | |
|
573 | 0 | } |
574 | 0 | } else { |
575 | | /* |
576 | | * Fallback to HTML or ASCII when the encoding is unspecified |
577 | | */ |
578 | 0 | if (handler == NULL) |
579 | 0 | handler = xmlFindCharEncodingHandler("HTML"); |
580 | 0 | if (handler == NULL) |
581 | 0 | handler = xmlFindCharEncodingHandler("ascii"); |
582 | 0 | } |
583 | |
|
584 | 0 | buf = xmlAllocOutputBufferInternal(handler); |
585 | 0 | if (buf == NULL) { |
586 | 0 | *mem = NULL; |
587 | 0 | *size = 0; |
588 | 0 | return; |
589 | 0 | } |
590 | | |
591 | 0 | htmlDocContentDumpFormatOutput(buf, cur, NULL, format); |
592 | |
|
593 | 0 | xmlOutputBufferFlush(buf); |
594 | 0 | if (buf->conv != NULL) { |
595 | 0 | *size = xmlBufUse(buf->conv); |
596 | 0 | *mem = xmlStrndup(xmlBufContent(buf->conv), *size); |
597 | 0 | } else { |
598 | 0 | *size = xmlBufUse(buf->buffer); |
599 | 0 | *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); |
600 | 0 | } |
601 | 0 | (void)xmlOutputBufferClose(buf); |
602 | 0 | } |
603 | | |
604 | | /** |
605 | | * htmlDocDumpMemory: |
606 | | * @cur: the document |
607 | | * @mem: OUT: the memory pointer |
608 | | * @size: OUT: the memory length |
609 | | * |
610 | | * Dump an HTML document in memory and return the xmlChar * and it's size. |
611 | | * It's up to the caller to free the memory. |
612 | | */ |
613 | | void |
614 | 0 | htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { |
615 | 0 | htmlDocDumpMemoryFormat(cur, mem, size, 1); |
616 | 0 | } |
617 | | |
618 | | |
619 | | /************************************************************************ |
620 | | * * |
621 | | * Dumping HTML tree content to an I/O output buffer * |
622 | | * * |
623 | | ************************************************************************/ |
624 | | |
625 | | void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); |
626 | | |
627 | | /** |
628 | | * htmlDtdDumpOutput: |
629 | | * @buf: the HTML buffer output |
630 | | * @doc: the document |
631 | | * @encoding: the encoding string |
632 | | * |
633 | | * TODO: check whether encoding is needed |
634 | | * |
635 | | * Dump the HTML document DTD, if any. |
636 | | */ |
637 | | static void |
638 | | htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
639 | 0 | const char *encoding ATTRIBUTE_UNUSED) { |
640 | 0 | xmlDtdPtr cur = doc->intSubset; |
641 | |
|
642 | 0 | if (cur == NULL) { |
643 | 0 | htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); |
644 | 0 | return; |
645 | 0 | } |
646 | 0 | xmlOutputBufferWriteString(buf, "<!DOCTYPE "); |
647 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
648 | 0 | if (cur->ExternalID != NULL) { |
649 | 0 | xmlOutputBufferWriteString(buf, " PUBLIC "); |
650 | 0 | xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); |
651 | 0 | if (cur->SystemID != NULL) { |
652 | 0 | xmlOutputBufferWriteString(buf, " "); |
653 | 0 | xmlBufWriteQuotedString(buf->buffer, cur->SystemID); |
654 | 0 | } |
655 | 0 | } else if (cur->SystemID != NULL && |
656 | 0 | xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { |
657 | 0 | xmlOutputBufferWriteString(buf, " SYSTEM "); |
658 | 0 | xmlBufWriteQuotedString(buf->buffer, cur->SystemID); |
659 | 0 | } |
660 | 0 | xmlOutputBufferWriteString(buf, ">\n"); |
661 | 0 | } |
662 | | |
663 | | /** |
664 | | * htmlAttrDumpOutput: |
665 | | * @buf: the HTML buffer output |
666 | | * @doc: the document |
667 | | * @cur: the attribute pointer |
668 | | * |
669 | | * Dump an HTML attribute |
670 | | */ |
671 | | static void |
672 | 0 | htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { |
673 | 0 | xmlChar *value; |
674 | | |
675 | | /* |
676 | | * The html output method should not escape a & character |
677 | | * occurring in an attribute value immediately followed by |
678 | | * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). |
679 | | * This is implemented in xmlEncodeEntitiesReentrant |
680 | | */ |
681 | |
|
682 | 0 | if (cur == NULL) { |
683 | 0 | return; |
684 | 0 | } |
685 | 0 | xmlOutputBufferWriteString(buf, " "); |
686 | 0 | if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { |
687 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); |
688 | 0 | xmlOutputBufferWriteString(buf, ":"); |
689 | 0 | } |
690 | 0 | xmlOutputBufferWriteString(buf, (const char *)cur->name); |
691 | 0 | if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { |
692 | 0 | value = xmlNodeListGetString(doc, cur->children, 0); |
693 | 0 | if (value) { |
694 | 0 | xmlOutputBufferWriteString(buf, "="); |
695 | 0 | if ((cur->ns == NULL) && (cur->parent != NULL) && |
696 | 0 | (cur->parent->ns == NULL) && |
697 | 0 | ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || |
698 | 0 | (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || |
699 | 0 | (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || |
700 | 0 | ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && |
701 | 0 | (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { |
702 | 0 | xmlChar *escaped; |
703 | 0 | xmlChar *tmp = value; |
704 | |
|
705 | 0 | while (IS_BLANK_CH(*tmp)) tmp++; |
706 | | |
707 | | /* |
708 | | * the < and > have already been escaped at the entity level |
709 | | * And doing so here breaks server side includes |
710 | | */ |
711 | 0 | escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>"); |
712 | 0 | if (escaped != NULL) { |
713 | 0 | xmlBufWriteQuotedString(buf->buffer, escaped); |
714 | 0 | xmlFree(escaped); |
715 | 0 | } else { |
716 | 0 | xmlBufWriteQuotedString(buf->buffer, value); |
717 | 0 | } |
718 | 0 | } else { |
719 | 0 | xmlBufWriteQuotedString(buf->buffer, value); |
720 | 0 | } |
721 | 0 | xmlFree(value); |
722 | 0 | } else { |
723 | 0 | xmlOutputBufferWriteString(buf, "=\"\""); |
724 | 0 | } |
725 | 0 | } |
726 | 0 | } |
727 | | |
/**
 * htmlNodeDumpFormatOutput:
 * @buf:  the HTML buffer output
 * @doc:  the document
 * @cur:  the current node
 * @encoding:  the encoding string (unused)
 * @format:  should formatting spaces been added
 *
 * Dump an HTML node, recursive behaviour,children are printed too.
 *
 * The subtree is walked iteratively: the outer loop writes the opening part
 * of the current node and descends into its children, the inner loop moves
 * to the next sibling or climbs back to the parent, writing end tags on the
 * way up. The walk stops once it is back on the node it started from.
 * Nothing is written if @cur or @buf is NULL.
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                         xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
                         int format) {
    /*
     * root:   node the dump started from, used as the stop condition.
     * parent: node the walk descended from, i.e. the parent that cur is
     *         expected to have; compared with cur->parent before descending.
     */
    xmlNodePtr root, parent;
    xmlAttrPtr attr;
    const htmlElemDesc * info;

    xmlInitParser();

    if ((cur == NULL) || (buf == NULL)) {
        return;
    }

    root = cur;
    parent = cur->parent;
    while (1) {
        switch (cur->type) {
        case XML_HTML_DOCUMENT_NODE:
        case XML_DOCUMENT_NODE:
            /* Document node: DOCTYPE first, then its children. */
            if (((xmlDocPtr) cur)->intSubset != NULL) {
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
            }
            if (cur->children != NULL) {
                /* Always validate cur->parent when descending. */
                if (cur->parent == parent) {
                    parent = cur;
                    cur = cur->children;
                    continue;
                }
            } else {
                xmlOutputBufferWriteString(buf, "\n");
            }
            break;

        case XML_ELEMENT_NODE:
            /*
             * Some users like lxml are known to pass nodes with a corrupted
             * tree structure. Fall back to a recursive call to handle this
             * case.
             */
            if ((cur->parent != parent) && (cur->children != NULL)) {
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
                break;
            }

            /*
             * Get specific HTML info for that node.
             * Namespaced elements get no HTML-specific treatment.
             */
            if (cur->ns == NULL)
                info = htmlTagLookup(cur->name);
            else
                info = NULL;

            /* Start tag: "<" [prefix ":"] name, namespace defs, attributes. */
            xmlOutputBufferWriteString(buf, "<");
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                xmlOutputBufferWriteString(buf, ":");
            }
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            if (cur->nsDef)
                xmlNsListDumpOutput(buf, cur->nsDef);
            attr = cur->properties;
            while (attr != NULL) {
                htmlAttrDumpOutput(buf, doc, attr);
                attr = attr->next;
            }

            if ((info != NULL) && (info->empty)) {
                /* Element described as empty: no end tag, children ignored. */
                xmlOutputBufferWriteString(buf, ">");
            } else if (cur->children == NULL) {
                /*
                 * Childless element: the end tag is left out when the
                 * description has saveEndTag set, except for html and body.
                 */
                if ((info != NULL) && (info->saveEndTag != 0) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
                    xmlOutputBufferWriteString(buf, ">");
                } else {
                    xmlOutputBufferWriteString(buf, "></");
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                        xmlOutputBufferWriteString(buf,
                                (const char *)cur->ns->prefix);
                        xmlOutputBufferWriteString(buf, ":");
                    }
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
                    xmlOutputBufferWriteString(buf, ">");
                }
            } else {
                /*
                 * Element with children: close the start tag and descend.
                 * The end tag is written by the climbing loop below.
                 */
                xmlOutputBufferWriteString(buf, ">");
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->children->type != HTML_TEXT_NODE) &&
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
                parent = cur;
                cur = cur->children;
                continue;
            }

            /*
             * Formatting newline after a non-inline element that was fully
             * written above, unless the next sibling is text or an entity
             * reference, or the parent name starts with 'p'.
             */
            if ((format) && (cur->next != NULL) &&
                (info != NULL) && (!info->isinline)) {
                if ((cur->next->type != HTML_TEXT_NODE) &&
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
                    (parent != NULL) &&
                    (parent->name != NULL) &&
                    (parent->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");
            }

            break;

        case XML_ATTRIBUTE_NODE:
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
            break;

        case HTML_TEXT_NODE:
            if (cur->content == NULL)
                break;
            /*
             * Text is entity-escaped unless the node name is
             * xmlStringTextNoenc or the parent is a script or style
             * element, in which case the content is written verbatim.
             * NOTE(review): the first operand of the || below is implied
             * by the second one, so the test reduces to
             * cur->name != xmlStringTextNoenc.
             */
            if (((cur->name == (const xmlChar *)xmlStringText) ||
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
                ((parent == NULL) ||
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
                xmlChar *buffer;

                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
                if (buffer != NULL) {
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
                    xmlFree(buffer);
                }
            } else {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        case HTML_COMMENT_NODE:
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, "<!--");
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
                xmlOutputBufferWriteString(buf, "-->");
            }
            break;

        case HTML_PI_NODE:
            /* Processing instruction, closed with ">" only. */
            if (cur->name != NULL) {
                xmlOutputBufferWriteString(buf, "<?");
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                if (cur->content != NULL) {
                    xmlOutputBufferWriteString(buf, " ");
                    xmlOutputBufferWriteString(buf,
                            (const char *)cur->content);
                }
                xmlOutputBufferWriteString(buf, ">");
            }
            break;

        case HTML_ENTITY_REF_NODE:
            xmlOutputBufferWriteString(buf, "&");
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
            xmlOutputBufferWriteString(buf, ";");
            break;

        case HTML_PRESERVE_NODE:
            /* Content is written as is, without escaping. */
            if (cur->content != NULL) {
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
            }
            break;

        default:
            /* Other node types produce no output. */
            break;
        }

        /*
         * The current node is done: move to its next sibling, or climb up
         * (closing each element left behind) until a sibling is found or
         * the starting node is reached.
         */
        while (1) {
            if (cur == root)
                return;
            if (cur->next != NULL) {
                cur = cur->next;
                break;
            }

            cur = parent;
            /* cur->parent was validated when descending. */
            parent = cur->parent;

            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
                (cur->type == XML_DOCUMENT_NODE)) {
                xmlOutputBufferWriteString(buf, "\n");
            } else {
                /* The element description is only needed when formatting. */
                if ((format) && (cur->ns == NULL))
                    info = htmlTagLookup(cur->name);
                else
                    info = NULL;

                /* Formatting newline before the end tag. */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->last->type != HTML_TEXT_NODE) &&
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
                    (cur->children != cur->last) &&
                    (cur->name != NULL) &&
                    (cur->name[0] != 'p')) /* p, pre, param */
                    xmlOutputBufferWriteString(buf, "\n");

                /* End tag: "</" [prefix ":"] name ">". */
                xmlOutputBufferWriteString(buf, "</");
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
                    xmlOutputBufferWriteString(buf, ":");
                }
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
                xmlOutputBufferWriteString(buf, ">");

                /* Formatting newline after the end tag. */
                if ((format) && (info != NULL) && (!info->isinline) &&
                    (cur->next != NULL)) {
                    if ((cur->next->type != HTML_TEXT_NODE) &&
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
                        (parent != NULL) &&
                        (parent->name != NULL) &&
                        (parent->name[0] != 'p')) /* p, pre, param */
                        xmlOutputBufferWriteString(buf, "\n");
                }
            }
        }
    }
}
960 | | |
961 | | /** |
962 | | * htmlNodeDumpOutput: |
963 | | * @buf: the HTML buffer output |
964 | | * @doc: the document |
965 | | * @cur: the current node |
966 | | * @encoding: the encoding string (unused) |
967 | | * |
968 | | * Dump an HTML node, recursive behaviour,children are printed too, |
969 | | * and formatting returns/spaces are added. |
970 | | */ |
971 | | void |
972 | | htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
973 | 0 | xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) { |
974 | 0 | htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1); |
975 | 0 | } |
976 | | |
977 | | /** |
978 | | * htmlDocContentDumpFormatOutput: |
979 | | * @buf: the HTML buffer output |
980 | | * @cur: the document |
981 | | * @encoding: the encoding string (unused) |
982 | | * @format: should formatting spaces been added |
983 | | * |
984 | | * Dump an HTML document. |
985 | | */ |
986 | | void |
987 | | htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, |
988 | | const char *encoding ATTRIBUTE_UNUSED, |
989 | 0 | int format) { |
990 | 0 | int type = 0; |
991 | 0 | if (cur) { |
992 | 0 | type = cur->type; |
993 | 0 | cur->type = XML_HTML_DOCUMENT_NODE; |
994 | 0 | } |
995 | 0 | htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format); |
996 | 0 | if (cur) |
997 | 0 | cur->type = (xmlElementType) type; |
998 | 0 | } |
999 | | |
1000 | | /** |
1001 | | * htmlDocContentDumpOutput: |
1002 | | * @buf: the HTML buffer output |
1003 | | * @cur: the document |
1004 | | * @encoding: the encoding string (unused) |
1005 | | * |
1006 | | * Dump an HTML document. Formatting return/spaces are added. |
1007 | | */ |
1008 | | void |
1009 | | htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, |
1010 | 0 | const char *encoding ATTRIBUTE_UNUSED) { |
1011 | 0 | htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1); |
1012 | 0 | } |
1013 | | |
1014 | | /************************************************************************ |
1015 | | * * |
1016 | | * Saving functions front-ends * |
1017 | | * * |
1018 | | ************************************************************************/ |
1019 | | |
1020 | | /** |
1021 | | * htmlDocDump: |
1022 | | * @f: the FILE* |
1023 | | * @cur: the document |
1024 | | * |
1025 | | * Dump an HTML document to an open FILE. |
1026 | | * |
1027 | | * returns: the number of byte written or -1 in case of failure. |
1028 | | */ |
1029 | | int |
1030 | 0 | htmlDocDump(FILE *f, xmlDocPtr cur) { |
1031 | 0 | xmlOutputBufferPtr buf; |
1032 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
1033 | 0 | const char *encoding; |
1034 | 0 | int ret; |
1035 | |
|
1036 | 0 | xmlInitParser(); |
1037 | |
|
1038 | 0 | if ((cur == NULL) || (f == NULL)) { |
1039 | 0 | return(-1); |
1040 | 0 | } |
1041 | | |
1042 | 0 | encoding = (const char *) htmlGetMetaEncoding(cur); |
1043 | |
|
1044 | 0 | if (encoding != NULL) { |
1045 | 0 | xmlCharEncoding enc; |
1046 | |
|
1047 | 0 | enc = xmlParseCharEncoding(encoding); |
1048 | 0 | if (enc != XML_CHAR_ENCODING_UTF8) { |
1049 | 0 | handler = xmlFindCharEncodingHandler(encoding); |
1050 | 0 | if (handler == NULL) |
1051 | 0 | htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); |
1052 | 0 | } |
1053 | 0 | } else { |
1054 | | /* |
1055 | | * Fallback to HTML or ASCII when the encoding is unspecified |
1056 | | */ |
1057 | 0 | if (handler == NULL) |
1058 | 0 | handler = xmlFindCharEncodingHandler("HTML"); |
1059 | 0 | if (handler == NULL) |
1060 | 0 | handler = xmlFindCharEncodingHandler("ascii"); |
1061 | 0 | } |
1062 | |
|
1063 | 0 | buf = xmlOutputBufferCreateFile(f, handler); |
1064 | 0 | if (buf == NULL) return(-1); |
1065 | 0 | htmlDocContentDumpOutput(buf, cur, NULL); |
1066 | |
|
1067 | 0 | ret = xmlOutputBufferClose(buf); |
1068 | 0 | return(ret); |
1069 | 0 | } |
1070 | | |
1071 | | /** |
1072 | | * htmlSaveFile: |
1073 | | * @filename: the filename (or URL) |
1074 | | * @cur: the document |
1075 | | * |
1076 | | * Dump an HTML document to a file. If @filename is "-" the stdout file is |
1077 | | * used. |
1078 | | * returns: the number of byte written or -1 in case of failure. |
1079 | | */ |
1080 | | int |
1081 | 0 | htmlSaveFile(const char *filename, xmlDocPtr cur) { |
1082 | 0 | xmlOutputBufferPtr buf; |
1083 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
1084 | 0 | const char *encoding; |
1085 | 0 | int ret; |
1086 | |
|
1087 | 0 | if ((cur == NULL) || (filename == NULL)) |
1088 | 0 | return(-1); |
1089 | | |
1090 | 0 | xmlInitParser(); |
1091 | |
|
1092 | 0 | encoding = (const char *) htmlGetMetaEncoding(cur); |
1093 | |
|
1094 | 0 | if (encoding != NULL) { |
1095 | 0 | xmlCharEncoding enc; |
1096 | |
|
1097 | 0 | enc = xmlParseCharEncoding(encoding); |
1098 | 0 | if (enc != XML_CHAR_ENCODING_UTF8) { |
1099 | 0 | handler = xmlFindCharEncodingHandler(encoding); |
1100 | 0 | if (handler == NULL) |
1101 | 0 | htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); |
1102 | 0 | } |
1103 | 0 | } else { |
1104 | | /* |
1105 | | * Fallback to HTML or ASCII when the encoding is unspecified |
1106 | | */ |
1107 | 0 | if (handler == NULL) |
1108 | 0 | handler = xmlFindCharEncodingHandler("HTML"); |
1109 | 0 | if (handler == NULL) |
1110 | 0 | handler = xmlFindCharEncodingHandler("ascii"); |
1111 | 0 | } |
1112 | | |
1113 | | /* |
1114 | | * save the content to a temp buffer. |
1115 | | */ |
1116 | 0 | buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); |
1117 | 0 | if (buf == NULL) return(0); |
1118 | | |
1119 | 0 | htmlDocContentDumpOutput(buf, cur, NULL); |
1120 | |
|
1121 | 0 | ret = xmlOutputBufferClose(buf); |
1122 | 0 | return(ret); |
1123 | 0 | } |
1124 | | |
1125 | | /** |
1126 | | * htmlSaveFileFormat: |
1127 | | * @filename: the filename |
1128 | | * @cur: the document |
1129 | | * @format: should formatting spaces been added |
1130 | | * @encoding: the document encoding |
1131 | | * |
1132 | | * Dump an HTML document to a file using a given encoding. |
1133 | | * |
1134 | | * returns: the number of byte written or -1 in case of failure. |
1135 | | */ |
1136 | | int |
1137 | | htmlSaveFileFormat(const char *filename, xmlDocPtr cur, |
1138 | 0 | const char *encoding, int format) { |
1139 | 0 | xmlOutputBufferPtr buf; |
1140 | 0 | xmlCharEncodingHandlerPtr handler = NULL; |
1141 | 0 | int ret; |
1142 | |
|
1143 | 0 | if ((cur == NULL) || (filename == NULL)) |
1144 | 0 | return(-1); |
1145 | | |
1146 | 0 | xmlInitParser(); |
1147 | |
|
1148 | 0 | if (encoding != NULL) { |
1149 | 0 | xmlCharEncoding enc; |
1150 | |
|
1151 | 0 | enc = xmlParseCharEncoding(encoding); |
1152 | 0 | if (enc != XML_CHAR_ENCODING_UTF8) { |
1153 | 0 | handler = xmlFindCharEncodingHandler(encoding); |
1154 | 0 | if (handler == NULL) |
1155 | 0 | htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); |
1156 | 0 | } |
1157 | 0 | htmlSetMetaEncoding(cur, (const xmlChar *) encoding); |
1158 | 0 | } else { |
1159 | 0 | htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); |
1160 | | |
1161 | | /* |
1162 | | * Fallback to HTML or ASCII when the encoding is unspecified |
1163 | | */ |
1164 | 0 | if (handler == NULL) |
1165 | 0 | handler = xmlFindCharEncodingHandler("HTML"); |
1166 | 0 | if (handler == NULL) |
1167 | 0 | handler = xmlFindCharEncodingHandler("ascii"); |
1168 | 0 | } |
1169 | | |
1170 | | /* |
1171 | | * save the content to a temp buffer. |
1172 | | */ |
1173 | 0 | buf = xmlOutputBufferCreateFilename(filename, handler, 0); |
1174 | 0 | if (buf == NULL) return(0); |
1175 | | |
1176 | 0 | htmlDocContentDumpFormatOutput(buf, cur, encoding, format); |
1177 | |
|
1178 | 0 | ret = xmlOutputBufferClose(buf); |
1179 | 0 | return(ret); |
1180 | 0 | } |
1181 | | |
1182 | | /** |
1183 | | * htmlSaveFileEnc: |
1184 | | * @filename: the filename |
1185 | | * @cur: the document |
1186 | | * @encoding: the document encoding |
1187 | | * |
1188 | | * Dump an HTML document to a file using a given encoding |
1189 | | * and formatting returns/spaces are added. |
1190 | | * |
1191 | | * returns: the number of byte written or -1 in case of failure. |
1192 | | */ |
1193 | | int |
1194 | 0 | htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { |
1195 | 0 | return(htmlSaveFileFormat(filename, cur, encoding, 1)); |
1196 | 0 | } |
1197 | | |
1198 | | #endif /* LIBXML_OUTPUT_ENABLED */ |
1199 | | |
1200 | | #endif /* LIBXML_HTML_ENABLED */ |