Coverage Report

Created: 2025-08-11 06:23

/src/libxml2/HTMLtree.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * Author: Daniel Veillard
7
 */
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17
18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/uri.h>
25
26
#include "private/buf.h"
27
#include "private/html.h"
28
#include "private/error.h"
29
#include "private/html.h"
30
#include "private/io.h"
31
#include "private/save.h"
32
#include "private/tree.h"
33
34
/************************************************************************
35
 *                  *
36
 *    Getting/Setting encoding meta tags      *
37
 *                  *
38
 ************************************************************************/
39
40
typedef struct {
41
    xmlAttrPtr attr; /* charset or content */
42
    const xmlChar *attrValue;
43
    htmlMetaEncodingOffsets off;
44
} htmlMetaEncoding;
45
46
static htmlNodePtr
47
9.04k
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
48
9.04k
    htmlNodePtr child;
49
50
21.3k
    for (child = parent->children; child != NULL; child = child->next) {
51
19.9k
        if ((child->type == XML_ELEMENT_NODE) &&
52
19.9k
            (xmlStrcasecmp(child->name, BAD_CAST name) == 0))
53
7.61k
            return(child);
54
19.9k
    }
55
56
1.42k
    return(NULL);
57
9.04k
}
58
59
static htmlNodePtr
60
5.39k
htmlFindHead(htmlDocPtr doc) {
61
5.39k
    htmlNodePtr html;
62
63
5.39k
    if (doc == NULL)
64
466
        return(NULL);
65
66
4.92k
    html = htmlFindFirstChild((htmlNodePtr) doc, "html");
67
4.92k
    if (html == NULL)
68
804
        return(NULL);
69
70
4.12k
    return(htmlFindFirstChild(html, "head"));
71
4.92k
}
72
73
int
74
2.69k
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
75
2.69k
    const xmlChar *p = val;
76
77
2.69k
    while (1) {
78
2.69k
        size_t start, end;
79
80
32.3k
        while ((*p != 'c') && (*p != 'C')) {
81
29.6k
            if (*p == 0)
82
0
                return(0);
83
29.6k
            p += 1;
84
29.6k
        }
85
2.69k
        p += 1;
86
87
2.69k
        if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0)
88
0
            continue;
89
90
2.69k
        p += 6;
91
2.69k
        while (IS_WS_HTML(*p)) p += 1;
92
93
2.69k
        if (*p != '=')
94
0
            continue;
95
96
2.69k
        p += 1;
97
2.89k
        while (IS_WS_HTML(*p)) p += 1;
98
99
2.69k
        if (*p == 0)
100
385
            return(0);
101
102
2.30k
        if ((*p == '"') || (*p == '\'')) {
103
1.20k
            int quote = *p;
104
105
1.20k
            p += 1;
106
1.40k
            while (IS_WS_HTML(*p)) p += 1;
107
108
1.20k
            start = p - val;
109
1.20k
            end = start;
110
111
2.80k
            while (*p != quote) {
112
2.39k
                if (*p == 0)
113
792
                    return(0);
114
1.60k
                if (!IS_WS_HTML(*p))
115
1.37k
                    end = p + 1 - val;
116
1.60k
                p += 1;
117
1.60k
            }
118
1.20k
        } else {
119
1.10k
            start = p - val;
120
121
4.82k
            while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p)))
122
3.71k
                p += 1;
123
124
1.10k
            end = p - val;
125
1.10k
        }
126
127
1.51k
        off->start = start;
128
1.51k
        off->end = end;
129
1.51k
        off->size = p - val + strlen((char *) p);
130
131
1.51k
        return(1);
132
2.30k
    }
133
134
0
    return(0);
135
2.69k
}
136
137
static xmlAttrPtr
138
3.28k
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
139
3.28k
    xmlAttrPtr attr, contentAttr = NULL;
140
3.28k
    int isContentType = 0;
141
142
3.28k
    if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
143
0
        return(NULL);
144
145
9.12k
    for (attr = elem->properties; attr != NULL; attr = attr->next) {
146
5.84k
        if (attr->ns != NULL)
147
243
            continue;
148
5.59k
        if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
149
0
            *outIsContentType = 0;
150
0
            return(attr);
151
0
        }
152
5.59k
        if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
153
2.69k
            contentAttr = attr;
154
5.59k
        if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
155
5.59k
            (attr->children != NULL) &&
156
5.59k
            (attr->children->type == XML_TEXT_NODE) &&
157
5.59k
            (attr->children->next == NULL) &&
158
5.59k
            (xmlStrcasecmp(attr->children->content,
159
2.76k
                           BAD_CAST "Content-Type") == 0))
160
2.76k
            isContentType = 1;
161
5.59k
    }
162
163
3.28k
    if ((isContentType) && (contentAttr != NULL)) {
164
2.69k
        *outIsContentType = 1;
165
2.69k
        return(contentAttr);
166
2.69k
    }
167
168
588
    return(NULL);
169
3.28k
}
170
171
static int
172
4.56k
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
173
4.56k
    xmlAttrPtr attr;
174
4.56k
    const xmlChar *val = NULL;
175
4.56k
    int isContentType;
176
177
4.56k
    if ((elem->type != XML_ELEMENT_NODE) ||
178
4.56k
        (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
179
1.28k
        return(0);
180
181
3.28k
    attr = htmlFindMetaEncodingAttr(elem, &isContentType);
182
3.28k
    if (attr == NULL)
183
588
        return(0);
184
185
2.69k
    if ((attr->children != NULL) &&
186
2.69k
        (attr->children->type == XML_TEXT_NODE) &&
187
2.69k
        (attr->children->next == NULL) &&
188
2.69k
        (attr->children->content != NULL))
189
2.69k
        val = attr->children->content;
190
0
    else
191
0
        val = BAD_CAST "";
192
193
194
2.69k
    if (!isContentType) {
195
0
        size_t size = strlen((char *) val);
196
0
        size_t start = 0;
197
0
        size_t end = size;
198
199
0
        while ((start < size) && (IS_WS_HTML(val[start])))
200
0
            start += 1;
201
202
0
        while ((end > 0) && (IS_WS_HTML(val[end-1])))
203
0
            end -= 1;
204
205
0
        menc->attr = attr;
206
0
        menc->attrValue = val;
207
0
        menc->off.start = start;
208
0
        menc->off.end = end;
209
0
        menc->off.size = size;
210
211
0
        return(1);
212
2.69k
    } else {
213
2.69k
        if (htmlParseContentType(val, &menc->off)) {
214
1.51k
            menc->attr = attr;
215
1.51k
            menc->attrValue = val;
216
217
1.51k
            return(1);
218
1.51k
        }
219
2.69k
    }
220
221
1.17k
    return(0);
222
2.69k
}
223
224
static xmlChar *
225
0
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
226
0
    xmlChar *newVal, *p;
227
0
    size_t size, oldEncSize, newEncSize;
228
229
    /*
230
     * The pseudo "HTML" encoding only produces ASCII.
231
     */
232
0
    if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
233
0
        encoding = "ASCII";
234
235
0
    oldEncSize = menc->off.end - menc->off.start;
236
0
    newEncSize = strlen((char *) encoding);
237
0
    size = menc->off.size - oldEncSize + newEncSize;
238
0
    newVal = xmlMalloc(size + 1);
239
0
    if (newVal == NULL)
240
0
        return(NULL);
241
242
0
    p = newVal;
243
0
    memcpy(p, menc->attrValue, menc->off.start);
244
0
    p += menc->off.start;
245
0
    memcpy(p, encoding, newEncSize);
246
0
    p += newEncSize;
247
0
    memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end);
248
0
    newVal[size] = 0;
249
250
0
    return(newVal);
251
0
}
252
253
/**
254
 * Look up and encoding declaration in the meta tags.
255
 *
256
 * The returned string points into attribute content and can contain
257
 * trailing garbage. It should be copied before modifying or freeing
258
 * nodes.
259
 *
260
 * @param doc  the document
261
 * @returns the encoding ot NULL if not found.
262
 */
263
const xmlChar *
264
5.39k
htmlGetMetaEncoding(xmlDoc *doc) {
265
5.39k
    htmlNodePtr head, node;
266
267
5.39k
    head = htmlFindHead(doc);
268
5.39k
    if (head == NULL)
269
1.89k
        return(NULL);
270
271
6.54k
    for (node = head->children; node != NULL; node = node->next) {
272
4.56k
        htmlMetaEncoding menc;
273
274
4.56k
        if (htmlParseMetaEncoding(node, &menc)) {
275
            /*
276
             * Returning a `const xmlChar *` only allows to return
277
             * a suffix. In http-equiv meta tags, there could be
278
             * more data after the charset, although it's probably
279
             * rare in practice.
280
             */
281
1.51k
            return(menc.attrValue + menc.off.start);
282
1.51k
        }
283
4.56k
    }
284
285
1.98k
    return(NULL);
286
3.49k
}
287
288
/**
289
 * Creates or updates a meta tag with an encoding declaration.
290
 *
291
 * NOTE: This will not change the document content encoding.
292
 *
293
 * @param doc  the document
294
 * @param encoding  the encoding string
295
 * @returns 0 in case of success, 1 if no head element was found or
296
 * arguments are invalid and -1 if memory allocation failed.
297
 */
298
int
299
0
htmlSetMetaEncoding(xmlDoc *doc, const xmlChar *encoding) {
300
0
    htmlNodePtr head, meta;
301
0
    int found = 0;
302
303
0
    if (encoding == NULL)
304
0
        return(1);
305
306
0
    head = htmlFindHead(doc);
307
0
    if (head == NULL)
308
0
        return(1);
309
310
0
    for (meta = head->children; meta != NULL; meta = meta->next) {
311
0
        htmlMetaEncoding menc;
312
313
0
        if (htmlParseMetaEncoding(meta, &menc)) {
314
0
            xmlChar *newVal;
315
0
            int ret;
316
317
0
            found = 1;
318
319
0
            newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding);
320
0
            if (newVal == NULL)
321
0
                return(-1);
322
0
            xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
323
0
            ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal);
324
0
            xmlFree(newVal);
325
326
0
            if (ret < 0)
327
0
                return(-1);
328
0
        }
329
0
    }
330
331
0
    if (found)
332
0
        return(0);
333
334
0
    meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
335
0
    if (meta == NULL)
336
0
        return(-1);
337
338
0
    if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
339
0
        xmlFreeNode(meta);
340
0
        return(-1);
341
0
    }
342
343
0
    if (head->children == NULL)
344
0
        xmlAddChild(head, meta);
345
0
    else
346
0
        xmlAddPrevSibling(head->children, meta);
347
348
0
    return(0);
349
0
}
350
351
/**
352
 * Determine if a given attribute is a boolean attribute. This
353
 * doesn't handle HTML5.
354
 *
355
 * @deprecated Internal function, don't use.
356
 *
357
 * @param name  the name of the attribute to check
358
 * @returns false if the attribute is not boolean, true otherwise.
359
 */
360
int
361
htmlIsBooleanAttr(const xmlChar *name)
362
17.6k
{
363
17.6k
    const char *str = NULL;
364
365
17.6k
    if (name == NULL)
366
256
        return(0);
367
368
    /*
369
     * These are the HTML attributes which will be output
370
     * in minimized form, i.e. `<option selected="selected">` will be
371
     * output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output
372
     * Method":
373
     *
374
     * "checked", "compact", "declare", "defer", "disabled", "ismap",
375
     * "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
376
     * "selected"
377
     *
378
     * Additional attributes from HTML5 (not implemented yet):
379
     *
380
     * "allowfullscreen", "alpha", "async", "autofocus", "autoplay",
381
     * "controls", "default", "formnovalidate", "inert", "itemscope",
382
     * "loop", "muted", "nomodule", "novalidate", "open", "playsinline",
383
     * "required", "reversed", "shadowrootdelegatesfocus",
384
     * "shadowrootclonable", "shadowrootserializable",
385
     * "shadowrootcustomelementregistry", "truespeed"
386
     */
387
388
17.3k
    switch (name[0] | 0x20) {
389
658
        case 'c':
390
658
            name += 1;
391
658
            switch (name[0] | 0x20) {
392
199
                case 'h': str = "ecked"; break;
393
261
                case 'o': str = "mpact"; break;
394
658
            }
395
658
            break;
396
989
        case 'd':
397
989
            name += 1;
398
989
            switch (name[0] | 0x20) {
399
596
                case 'e':
400
596
                    name += 1;
401
596
                    switch (name[0] | 0x20) {
402
198
                        case 'c': str = "lare"; break;
403
199
                        case 'f': str = "er"; break;
404
596
                    }
405
596
                    break;
406
596
                case 'i': str = "sabled"; break;
407
989
            }
408
989
            break;
409
989
        case 'i':
410
418
            str = "smap";
411
418
            break;
412
308
        case 'm':
413
308
            str = "ultiple";
414
308
            break;
415
1.55k
        case 'n':
416
1.55k
            name += 1;
417
1.55k
            if ((name[0] | 0x20) != 'o')
418
289
                break;
419
1.26k
            name += 1;
420
1.26k
            switch (name[0] | 0x20) {
421
487
                case 'h': str = "ref"; break;
422
196
                case 'r': str = "esize"; break;
423
194
                case 's': str = "hade"; break;
424
194
                case 'w': str = "rap"; break;
425
1.26k
            }
426
1.26k
            break;
427
1.26k
        case 'r':
428
204
            str = "eadonly";
429
204
            break;
430
1.62k
        case 's':
431
1.62k
            str = "elected";
432
1.62k
            break;
433
17.3k
    }
434
435
17.3k
    if (str == NULL)
436
12.7k
        return(0);
437
438
4.67k
    return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0);
439
17.3k
}
440
441
#ifdef LIBXML_OUTPUT_ENABLED
442
/************************************************************************
443
 *                  *
444
 *    Dumping HTML tree content to a simple buffer    *
445
 *                  *
446
 ************************************************************************/
447
448
static xmlParserErrors
449
7.23k
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
450
    /*
451
     * Fallback to HTML if the encoding is unspecified
452
     */
453
7.23k
    if (encoding == NULL)
454
6.51k
        encoding = "HTML";
455
456
7.23k
    return(xmlOpenCharEncodingHandler(encoding, /* output */ 1, out));
457
7.23k
}
458
459
/**
460
 * Serialize an HTML document to an xmlBuf.
461
 *
462
 * @param buf  the xmlBuf output
463
 * @param doc  the document (unused)
464
 * @param cur  the current node
465
 * @param format  should formatting newlines been added
466
 * @returns the number of bytes written or -1 in case of error
467
 */
468
static size_t
469
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
470
4.62k
                      xmlNodePtr cur, int format) {
471
4.62k
    size_t use;
472
4.62k
    size_t ret;
473
4.62k
    xmlOutputBufferPtr outbuf;
474
475
4.62k
    if (cur == NULL) {
476
0
  return ((size_t) -1);
477
0
    }
478
4.62k
    if (buf == NULL) {
479
0
  return ((size_t) -1);
480
0
    }
481
4.62k
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
482
4.62k
    if (outbuf == NULL)
483
1
  return ((size_t) -1);
484
4.62k
    memset(outbuf, 0, sizeof(xmlOutputBuffer));
485
4.62k
    outbuf->buffer = buf;
486
4.62k
    outbuf->encoder = NULL;
487
4.62k
    outbuf->writecallback = NULL;
488
4.62k
    outbuf->closecallback = NULL;
489
4.62k
    outbuf->context = NULL;
490
4.62k
    outbuf->written = 0;
491
492
4.62k
    use = xmlBufUse(buf);
493
4.62k
    htmlNodeDumpInternal(outbuf, cur, NULL, format);
494
4.62k
    if (outbuf->error)
495
2
        ret = (size_t) -1;
496
4.61k
    else
497
4.61k
        ret = xmlBufUse(buf) - use;
498
4.62k
    xmlFree(outbuf);
499
4.62k
    return (ret);
500
4.62k
}
501
502
/**
503
 * Serialize an HTML node to an xmlBuffer. Always uses UTF-8.
504
 *
505
 * @param buf  the HTML buffer output
506
 * @param doc  the document
507
 * @param cur  the current node
508
 * @returns the number of bytes written or -1 in case of error
509
 */
510
int
511
5.88k
htmlNodeDump(xmlBuffer *buf, xmlDoc *doc, xmlNode *cur) {
512
5.88k
    xmlBufPtr buffer;
513
5.88k
    size_t ret1;
514
5.88k
    int ret2;
515
516
5.88k
    if ((buf == NULL) || (cur == NULL))
517
1.25k
        return(-1);
518
519
4.62k
    xmlInitParser();
520
4.62k
    buffer = xmlBufFromBuffer(buf);
521
4.62k
    if (buffer == NULL)
522
2
        return(-1);
523
524
4.62k
    ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
525
526
4.62k
    ret2 = xmlBufBackToBuffer(buffer, buf);
527
528
4.62k
    if ((ret1 == (size_t) -1) || (ret2 < 0))
529
3
        return(-1);
530
4.61k
    return(ret1 > INT_MAX ? INT_MAX : ret1);
531
4.62k
}
532
533
/**
534
 * Serialize an HTML node to an xmlBuffer.
535
 *
536
 * If encoding is NULL, ASCII with HTML 4.0 named character entities
537
 * will be used. This is inefficient compared to UTF-8 and might be
538
 * changed in a future version.
539
 *
540
 * @param out  the FILE pointer
541
 * @param doc  the document (unused)
542
 * @param cur  the current node
543
 * @param encoding  the document encoding (optional)
544
 * @param format  should formatting newlines been added
545
 * @returns the number of bytes written or -1 in case of failure.
546
 */
547
int
548
htmlNodeDumpFileFormat(FILE *out, xmlDoc *doc ATTRIBUTE_UNUSED,
549
0
                 xmlNode *cur, const char *encoding, int format) {
550
0
    xmlOutputBufferPtr buf;
551
0
    xmlCharEncodingHandlerPtr handler;
552
0
    int ret;
553
554
0
    xmlInitParser();
555
556
    /*
557
     * save the content to a temp buffer.
558
     */
559
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
560
0
        return(-1);
561
0
    buf = xmlOutputBufferCreateFile(out, handler);
562
0
    if (buf == NULL) {
563
0
        xmlCharEncCloseFunc(handler);
564
0
        return(-1);
565
0
    }
566
567
0
    htmlNodeDumpInternal(buf, cur, NULL, format);
568
569
0
    ret = xmlOutputBufferClose(buf);
570
0
    return(ret);
571
0
}
572
573
/**
574
 * Same as #htmlNodeDumpFileFormat with `format` set to 1 which is
575
 * typically undesired. Use of this function is DISCOURAGED in favor
576
 * of #htmlNodeDumpFileFormat.
577
 *
578
 * @param out  the FILE pointer
579
 * @param doc  the document
580
 * @param cur  the current node
581
 */
582
void
583
0
htmlNodeDumpFile(FILE *out, xmlDoc *doc, xmlNode *cur) {
584
0
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
585
0
}
586
587
/**
588
 * Serialize an HTML node to a memory, also returning the size of
589
 * the result. It's up to the caller to free the memory.
590
 *
591
 * Uses the encoding of the document. If the document has no
592
 * encoding, ASCII with HTML 4.0 named character entities will
593
 * be used. This is inefficient compared to UTF-8 and might be
594
 * changed in a future version.
595
 *
596
 * @param cur  the document
597
 * @param mem  OUT: the memory pointer
598
 * @param size  OUT: the memory length
599
 * @param format  should formatting newlines been added
600
 */
601
void
602
7.89k
htmlDocDumpMemoryFormat(xmlDoc *cur, xmlChar**mem, int *size, int format) {
603
7.89k
    xmlOutputBufferPtr buf;
604
7.89k
    xmlCharEncodingHandlerPtr handler = NULL;
605
606
7.89k
    xmlInitParser();
607
608
7.89k
    if ((mem == NULL) || (size == NULL))
609
0
        return;
610
7.89k
    *mem = NULL;
611
7.89k
    *size = 0;
612
7.89k
    if (cur == NULL)
613
662
  return;
614
615
7.23k
    if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
616
4
        return;
617
7.23k
    buf = xmlAllocOutputBuffer(handler);
618
7.23k
    if (buf == NULL) {
619
2
        xmlCharEncCloseFunc(handler);
620
2
  return;
621
2
    }
622
623
7.22k
    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
624
625
7.22k
    xmlOutputBufferFlush(buf);
626
627
7.22k
    if (!buf->error) {
628
7.01k
        if (buf->conv != NULL) {
629
6.80k
            *size = xmlBufUse(buf->conv);
630
6.80k
            *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
631
6.80k
        } else {
632
203
            *size = xmlBufUse(buf->buffer);
633
203
            *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
634
203
        }
635
7.01k
    }
636
637
7.22k
    xmlOutputBufferClose(buf);
638
7.22k
}
639
640
/**
641
 * Same as #htmlDocDumpMemoryFormat with `format` set to 1 which
642
 * is typically undesired. Also see the warnings there. Use of
643
 * this function is DISCOURAGED in favor of
644
 * #htmlDocContentDumpFormatOutput.
645
 *
646
 * @param cur  the document
647
 * @param mem  OUT: the memory pointer
648
 * @param size  OUT: the memory length
649
 */
650
void
651
4.55k
htmlDocDumpMemory(xmlDoc *cur, xmlChar**mem, int *size) {
652
4.55k
    htmlDocDumpMemoryFormat(cur, mem, size, 1);
653
4.55k
}
654
655
656
/************************************************************************
657
 *                  *
658
 *    Dumping HTML tree content to an I/O output buffer *
659
 *                  *
660
 ************************************************************************/
661
662
/**
663
 * Serialize the HTML document's DTD, if any.
664
 *
665
 * Ignores `encoding` and uses the encoding of the output buffer.
666
 *
667
 * @param buf  the HTML buffer output
668
 * @param doc  the document
669
 * @param encoding  the encoding string (unused)
670
 */
671
static void
672
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
673
10.0k
            const char *encoding ATTRIBUTE_UNUSED) {
674
10.0k
    xmlDtdPtr cur = doc->intSubset;
675
676
10.0k
    if (cur == NULL)
677
0
  return;
678
10.0k
    xmlOutputBufferWrite(buf, 10, "<!DOCTYPE ");
679
10.0k
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
680
10.0k
    if (cur->ExternalID != NULL) {
681
5.97k
  xmlOutputBufferWrite(buf, 8, " PUBLIC ");
682
5.97k
  xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
683
5.97k
  if (cur->SystemID != NULL) {
684
5.57k
      xmlOutputBufferWrite(buf, 1, " ");
685
5.57k
      xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
686
5.57k
  }
687
5.97k
    } else if (cur->SystemID != NULL &&
688
4.08k
         xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
689
3.68k
  xmlOutputBufferWrite(buf, 8, " SYSTEM ");
690
3.68k
  xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
691
3.68k
    }
692
10.0k
    xmlOutputBufferWrite(buf, 2, ">\n");
693
10.0k
}
694
695
static void
696
846
htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) {
697
846
    const xmlChar *tmp = content;
698
699
    /*
700
     * See appendix "B.2.1 Non-ASCII characters in URI attribute
701
     * values" in the HTML 4.01 spec. This is also recommended
702
     * by the HTML output method of the XSLT 1.0 spec.
703
     *
704
     * We also escape space and control chars.
705
     */
706
707
    /* Skip over initial whitespace */
708
1.46k
    while (IS_WS_HTML(*tmp)) tmp++;
709
846
    if (tmp > content) {
710
265
        xmlOutputBufferWrite(buf, tmp - content, (char *) content);
711
265
        content = tmp;
712
265
    }
713
714
1.88k
    while (1) {
715
1.88k
        char escbuf[3];
716
1.88k
        const char *repl;
717
1.88k
        int replSize;
718
1.88k
        int c = *tmp;
719
720
5.34k
        while ((c > 0x20) && (c < 0x7F) && (c != '"') && (c != '&')) {
721
3.45k
            tmp += 1;
722
3.45k
            c = *tmp;
723
3.45k
        }
724
725
1.88k
        if (tmp > content)
726
827
            xmlOutputBufferWrite(buf, tmp - content, (char *) content);
727
728
1.88k
        if ((c <= 0x20) || (c >= 0x7F)) {
729
1.36k
            static const char hex[16] = {
730
1.36k
                '0', '1', '2', '3', '4', '5', '6', '7',
731
1.36k
                '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
732
1.36k
            };
733
734
1.36k
            if (c == 0)
735
846
                break;
736
737
518
            escbuf[0] = '%';
738
518
            escbuf[1] = hex[(c >> 4) & 0x0F];
739
518
            escbuf[2] = hex[c & 0x0F];
740
518
            repl = escbuf;
741
518
            replSize = 3;
742
520
        } else if (c == '"') {
743
302
            repl = "&quot;";
744
302
            replSize = 6;
745
302
        } else {
746
218
            repl = "&amp;";
747
218
            replSize = 5;
748
218
        }
749
750
1.03k
        xmlOutputBufferWrite(buf, replSize, repl);
751
1.03k
        tmp += 1;
752
1.03k
        content = tmp;
753
1.03k
    }
754
846
}
755
756
/**
757
 * Serialize an HTML attribute.
758
 *
759
 * @param buf  the HTML buffer output
760
 * @param cur  the attribute pointer
761
 */
762
static void
763
17.6k
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) {
764
17.6k
    xmlOutputBufferWrite(buf, 1, " ");
765
766
17.6k
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
767
9.35k
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
768
9.35k
        xmlOutputBufferWrite(buf, 1, ":");
769
9.35k
    }
770
17.6k
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
771
772
    /*
773
     * The HTML5 spec requires to always serialize empty attribute
774
     * values as `=""`. We should probably align with HTML5 at some
775
     * point.
776
     */
777
17.6k
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
778
15.1k
        xmlNodePtr child;
779
15.1k
        int isUri;
780
781
15.1k
        xmlOutputBufferWrite(buf, 2, "=\"");
782
783
        /*
784
         * Special handling of URIs doesn't conform to HTML5 and
785
         * should probably be removed at some point.
786
         */
787
15.1k
        isUri = (cur->ns == NULL) && (cur->parent != NULL) &&
788
15.1k
                (cur->parent->ns == NULL) &&
789
15.1k
                ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
790
4.77k
                 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
791
4.77k
                 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
792
4.77k
                 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
793
3.93k
                  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))));
794
795
32.4k
        for (child = cur->children; child != NULL; child = child->next) {
796
17.3k
            if (child->type == XML_TEXT_NODE) {
797
16.2k
                const xmlChar *content = child->content;
798
799
16.2k
                if (content == NULL)
800
262
                    continue;
801
802
15.9k
                if (isUri) {
803
846
                    htmlSerializeUri(buf, content);
804
15.1k
                } else {
805
15.1k
                    xmlSerializeText(buf, content, SIZE_MAX,
806
15.1k
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
807
15.1k
                }
808
15.9k
            } else if (child->type == XML_ENTITY_REF_NODE) {
809
                /* TODO: We should probably expand entity refs */
810
1.13k
                xmlOutputBufferWrite(buf, 1, "&");
811
1.13k
                xmlOutputBufferWriteString(buf, (char *) child->name);
812
1.13k
                xmlOutputBufferWrite(buf, 1, ";");
813
1.13k
            }
814
17.3k
        }
815
816
15.1k
        xmlOutputBufferWrite(buf, 1, "\"");
817
15.1k
    }
818
17.6k
}
819
820
/**
821
 * Serialize an HTML node to an output buffer.
822
 *
823
 * If `encoding` is specified, it is used to create or update meta
824
 * tags containing the character encoding.
825
 *
826
 * @param buf  the HTML buffer output
827
 * @param cur  the current node
828
 * @param encoding  the encoding string (optional)
829
 * @param format  should formatting newlines been added
830
 */
831
void
832
htmlNodeDumpInternal(xmlOutputBuffer *buf, xmlNode *cur,
833
28.5k
                     const char *encoding, int format) {
834
28.5k
    xmlNodePtr root, parent, metaHead = NULL;
835
28.5k
    xmlAttrPtr attr;
836
28.5k
    const htmlElemDesc * info;
837
28.5k
    int isRaw = 0;
838
839
28.5k
    xmlInitParser();
840
841
28.5k
    if ((cur == NULL) || (buf == NULL)) {
842
1.46k
  return;
843
1.46k
    }
844
845
27.0k
    root = cur;
846
27.0k
    parent = cur->parent;
847
176k
    while (1) {
848
176k
        switch (cur->type) {
849
9.71k
        case XML_HTML_DOCUMENT_NODE:
850
13.2k
        case XML_DOCUMENT_NODE:
851
13.2k
            if (((xmlDocPtr) cur)->intSubset != NULL) {
852
10.0k
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
853
10.0k
            }
854
13.2k
            if (cur->children != NULL) {
855
                /* Always validate cur->parent when descending. */
856
12.1k
                if (cur->parent == parent) {
857
12.1k
                    parent = cur;
858
12.1k
                    cur = cur->children;
859
12.1k
                    continue;
860
12.1k
                }
861
12.1k
            } else {
862
1.15k
                xmlOutputBufferWrite(buf, 1, "\n");
863
1.15k
            }
864
1.15k
            break;
865
866
82.7k
        case XML_ELEMENT_NODE: {
867
82.7k
            htmlMetaEncoding menc;
868
82.7k
            int isMeta = 0;
869
82.7k
            int addMeta = 0;
870
871
            /*
872
             * Some users like lxml are known to pass nodes with a corrupted
873
             * tree structure. Fall back to a recursive call to handle this
874
             * case.
875
             */
876
82.7k
            if ((cur->parent != parent) && (cur->children != NULL)) {
877
0
                htmlNodeDumpInternal(buf, cur, encoding, format);
878
0
                break;
879
0
            }
880
881
            /*
882
             * Get specific HTML info for that node.
883
             */
884
82.7k
            if (cur->ns == NULL)
885
42.1k
                info = htmlTagLookup(cur->name);
886
40.6k
            else
887
40.6k
                info = NULL;
888
889
82.7k
            if (encoding != NULL) {
890
0
                isMeta = htmlParseMetaEncoding(cur, &menc);
891
892
                /*
893
                 * Don't add meta tag for "HTML" encoding.
894
                 */
895
0
                if ((xmlStrcasecmp(BAD_CAST encoding,
896
0
                                   BAD_CAST "HTML") != 0) &&
897
0
                    (xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
898
0
                    (parent != NULL) &&
899
0
                    (xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
900
0
                    (parent->parent != NULL) &&
901
0
                    (parent->parent->parent == NULL) &&
902
0
                    (metaHead == NULL)) {
903
0
                    xmlNodePtr n;
904
905
0
                    metaHead = cur;
906
0
                    addMeta = 1;
907
908
0
                    for (n = cur->children; n != NULL; n = n->next) {
909
0
                        int unused;
910
911
0
                        if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
912
0
                            metaHead = NULL;
913
0
                            addMeta = 0;
914
0
                            break;
915
0
                        }
916
0
                    }
917
0
                }
918
0
            }
919
920
82.7k
            xmlOutputBufferWrite(buf, 1, "<");
921
82.7k
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
922
37.0k
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
923
37.0k
                xmlOutputBufferWrite(buf, 1, ":");
924
37.0k
            }
925
82.7k
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
926
82.7k
            if (cur->nsDef)
927
33.0k
                xmlNsListDumpOutput(buf, cur->nsDef);
928
82.7k
            attr = cur->properties;
929
98.6k
            while (attr != NULL) {
930
15.9k
                if ((!isMeta) || (attr != menc.attr)) {
931
15.9k
                    htmlAttrDumpOutput(buf, attr);
932
15.9k
                } else {
933
0
                    xmlOutputBufferWrite(buf, 1, " ");
934
0
                    xmlOutputBufferWriteString(buf, (char *) attr->name);
935
936
0
                    xmlOutputBufferWrite(buf, 2, "=\"");
937
0
                    xmlSerializeText(buf, menc.attrValue, menc.off.start,
938
0
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
939
0
                    xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
940
0
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
941
0
                    xmlSerializeText(buf, menc.attrValue + menc.off.end,
942
0
                                     menc.off.size - menc.off.end,
943
0
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
944
0
                    xmlOutputBufferWrite(buf, 1, "\"");
945
0
                }
946
15.9k
                attr = attr->next;
947
15.9k
            }
948
949
82.7k
            if ((info != NULL) && (info->empty)) {
950
224
                xmlOutputBufferWrite(buf, 1, ">");
951
82.5k
            } else if (cur->children == NULL) {
952
11.7k
                if (addMeta) {
953
0
                    xmlOutputBufferWrite(buf, 16, "><meta charset=\"");
954
0
                    xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
955
0
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
956
0
                    xmlOutputBufferWrite(buf, 4, "\"></");
957
11.7k
                } else {
958
11.7k
                    xmlOutputBufferWrite(buf, 3, "></");
959
11.7k
                }
960
11.7k
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
961
1.14k
                    xmlOutputBufferWriteString(buf,
962
1.14k
                            (const char *)cur->ns->prefix);
963
1.14k
                    xmlOutputBufferWrite(buf, 1, ":");
964
1.14k
                }
965
11.7k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
966
11.7k
                xmlOutputBufferWrite(buf, 1, ">");
967
70.7k
            } else {
968
70.7k
                xmlOutputBufferWrite(buf, 1, ">");
969
70.7k
                if ((format) &&
970
70.7k
                    ((addMeta) ||
971
67.2k
                     ((info != NULL) && (!info->isinline) &&
972
67.2k
                      (cur->children->type != HTML_TEXT_NODE) &&
973
67.2k
                      (cur->children->type != HTML_ENTITY_REF_NODE) &&
974
67.2k
                      (cur->children != cur->last) &&
975
67.2k
                      (cur->name != NULL) &&
976
67.2k
                      (cur->name[0] != 'p')))) /* p, pre, param */
977
989
                    xmlOutputBufferWrite(buf, 1, "\n");
978
70.7k
                if (addMeta) {
979
0
                    xmlOutputBufferWrite(buf, 15, "<meta charset=\"");
980
0
                    xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
981
0
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
982
0
                    xmlOutputBufferWrite(buf, 2, "\">");
983
0
                    if ((format) &&
984
0
                        (cur->children->type != HTML_TEXT_NODE) &&
985
0
                        (cur->children->type != HTML_ENTITY_REF_NODE))
986
0
                        xmlOutputBufferWrite(buf, 1, "\n");
987
0
                }
988
989
70.7k
                if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT))
990
195
                    isRaw = 1;
991
992
70.7k
                parent = cur;
993
70.7k
                cur = cur->children;
994
70.7k
                continue;
995
70.7k
            }
996
997
11.9k
            if ((format) && (cur->next != NULL) &&
998
11.9k
                (info != NULL) && (!info->isinline)) {
999
1.81k
                if ((cur->next->type != HTML_TEXT_NODE) &&
1000
1.81k
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
1001
1.81k
                    (parent != NULL) &&
1002
1.81k
                    (parent->name != NULL) &&
1003
1.81k
                    (parent->name[0] != 'p')) /* p, pre, param */
1004
900
                    xmlOutputBufferWrite(buf, 1, "\n");
1005
1.81k
            }
1006
1007
11.9k
            break;
1008
82.7k
        }
1009
1010
1.70k
        case XML_ATTRIBUTE_NODE:
1011
1.70k
            htmlAttrDumpOutput(buf, (xmlAttrPtr) cur);
1012
1.70k
            break;
1013
1014
58.6k
        case HTML_TEXT_NODE:
1015
58.6k
            if (cur->content == NULL)
1016
421
                break;
1017
58.1k
            if ((cur->name == (const xmlChar *)xmlStringTextNoenc) ||
1018
58.1k
                (isRaw)) {
1019
195
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
1020
57.9k
            } else {
1021
57.9k
                xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML);
1022
57.9k
            }
1023
58.1k
            break;
1024
1025
1.95k
        case HTML_COMMENT_NODE:
1026
1.95k
            if (cur->content != NULL) {
1027
1.72k
                xmlOutputBufferWrite(buf, 4, "<!--");
1028
1.72k
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
1029
1.72k
                xmlOutputBufferWrite(buf, 3, "-->");
1030
1.72k
            }
1031
1.95k
            break;
1032
1033
1.50k
        case HTML_PI_NODE:
1034
1.50k
            if (cur->name != NULL) {
1035
1.30k
                xmlOutputBufferWrite(buf, 2, "<?");
1036
1.30k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
1037
1.30k
                if (cur->content != NULL) {
1038
1.00k
                    xmlOutputBufferWrite(buf, 1, " ");
1039
1.00k
                    xmlOutputBufferWriteString(buf,
1040
1.00k
                            (const char *)cur->content);
1041
1.00k
                }
1042
1.30k
                xmlOutputBufferWrite(buf, 1, ">");
1043
1.30k
            }
1044
1.50k
            break;
1045
1046
3.76k
        case HTML_ENTITY_REF_NODE:
1047
3.76k
            xmlOutputBufferWrite(buf, 1, "&");
1048
3.76k
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
1049
3.76k
            xmlOutputBufferWrite(buf, 1, ";");
1050
3.76k
            break;
1051
1052
449
        case HTML_PRESERVE_NODE:
1053
449
            if (cur->content != NULL) {
1054
240
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
1055
240
            }
1056
449
            break;
1057
1058
12.4k
        default:
1059
12.4k
            break;
1060
176k
        }
1061
1062
176k
        while (1) {
1063
176k
            if (cur == root)
1064
27.0k
                return;
1065
149k
            if (cur->next != NULL) {
1066
66.5k
                cur = cur->next;
1067
66.5k
                break;
1068
66.5k
            }
1069
1070
82.8k
            isRaw = 0;
1071
1072
82.8k
            cur = parent;
1073
            /* cur->parent was validated when descending. */
1074
82.8k
            parent = cur->parent;
1075
1076
82.8k
            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
1077
82.8k
                (cur->type == XML_DOCUMENT_NODE)) {
1078
12.1k
                xmlOutputBufferWrite(buf, 1, "\n");
1079
70.7k
            } else {
1080
70.7k
                if ((format) && (cur->ns == NULL))
1081
29.1k
                    info = htmlTagLookup(cur->name);
1082
41.6k
                else
1083
41.6k
                    info = NULL;
1084
1085
70.7k
                if ((format) && (info != NULL) && (!info->isinline) &&
1086
70.7k
                    (cur->last->type != HTML_TEXT_NODE) &&
1087
70.7k
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
1088
70.7k
                    ((cur->children != cur->last) || (cur == metaHead)) &&
1089
70.7k
                    (cur->name != NULL) &&
1090
70.7k
                    (cur->name[0] != 'p')) /* p, pre, param */
1091
947
                    xmlOutputBufferWrite(buf, 1, "\n");
1092
1093
70.7k
                xmlOutputBufferWrite(buf, 2, "</");
1094
70.7k
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
1095
35.8k
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
1096
35.8k
                    xmlOutputBufferWrite(buf, 1, ":");
1097
35.8k
                }
1098
70.7k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
1099
70.7k
                xmlOutputBufferWrite(buf, 1, ">");
1100
1101
70.7k
                if ((format) && (info != NULL) && (!info->isinline) &&
1102
70.7k
                    (cur->next != NULL)) {
1103
2.07k
                    if ((cur->next->type != HTML_TEXT_NODE) &&
1104
2.07k
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
1105
2.07k
                        (parent != NULL) &&
1106
2.07k
                        (parent->name != NULL) &&
1107
2.07k
                        (parent->name[0] != 'p')) /* p, pre, param */
1108
410
                        xmlOutputBufferWrite(buf, 1, "\n");
1109
2.07k
                }
1110
1111
70.7k
                if (cur == metaHead)
1112
0
                    metaHead = NULL;
1113
70.7k
            }
1114
82.8k
        }
1115
93.5k
    }
1116
27.0k
}
1117
1118
/**
1119
 * Serialize an HTML node to an output buffer.
1120
 *
1121
 * @param buf  the HTML buffer output
1122
 * @param doc  the document (unused)
1123
 * @param cur  the current node
1124
 * @param encoding  the encoding string (unused)
1125
 * @param format  should formatting newlines been added
1126
 */
1127
void
1128
htmlNodeDumpFormatOutput(xmlOutputBuffer *buf,
1129
                         xmlDoc *doc ATTRIBUTE_UNUSED, xmlNode *cur,
1130
3.09k
                         const char *encoding ATTRIBUTE_UNUSED, int format) {
1131
3.09k
    htmlNodeDumpInternal(buf, cur, NULL, format);
1132
3.09k
}
1133
1134
/**
1135
 * Same as #htmlNodeDumpFormatOutput with `format` set to 1 which is
1136
 * typically undesired. Use of this function is DISCOURAGED in favor
1137
 * of #htmlNodeDumpFormatOutput.
1138
 *
1139
 * @param buf  the HTML buffer output
1140
 * @param doc  the document (unused)
1141
 * @param cur  the current node
1142
 * @param encoding  the encoding string (unused)
1143
 */
1144
void
1145
htmlNodeDumpOutput(xmlOutputBuffer *buf, xmlDoc *doc ATTRIBUTE_UNUSED,
1146
7.02k
                   xmlNode *cur, const char *encoding ATTRIBUTE_UNUSED) {
1147
7.02k
    htmlNodeDumpInternal(buf, cur, NULL, 1);
1148
7.02k
}
1149
1150
/**
1151
 * Serialize an HTML document to an output buffer.
1152
 *
1153
 * @param buf  the HTML buffer output
1154
 * @param cur  the document
1155
 * @param encoding  the encoding string (unused)
1156
 * @param format  should formatting newlines been added
1157
 */
1158
void
1159
htmlDocContentDumpFormatOutput(xmlOutputBuffer *buf, xmlDoc *cur,
1160
                         const char *encoding ATTRIBUTE_UNUSED,
1161
9.44k
                               int format) {
1162
9.44k
    htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format);
1163
9.44k
}
1164
1165
/**
1166
 * Same as #htmlDocContentDumpFormatOutput with `format` set to 1
1167
 * which is typically undesired. Use of this function is DISCOURAGED
1168
 * in favor of #htmlDocContentDumpFormatOutput.
1169
 *
1170
 * @param buf  the HTML buffer output
1171
 * @param cur  the document
1172
 * @param encoding  the encoding string (unused)
1173
 */
1174
void
1175
htmlDocContentDumpOutput(xmlOutputBuffer *buf, xmlDoc *cur,
1176
2.19k
                   const char *encoding ATTRIBUTE_UNUSED) {
1177
2.19k
    htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1);
1178
2.19k
}
1179
1180
/************************************************************************
1181
 *                  *
1182
 *    Saving functions front-ends       *
1183
 *                  *
1184
 ************************************************************************/
1185
1186
/**
1187
 * Serialize an HTML document to an open `FILE`.
1188
 *
1189
 * Uses the encoding of the document. If the document has no
1190
 * encoding, ASCII with HTML 4.0 named character entities will
1191
 * be used. This is inefficient compared to UTF-8 and might be
1192
 * changed in a future version.
1193
 *
1194
 * Enables "formatting" unconditionally which is typically
1195
 * undesired.
1196
 *
1197
 * Use of this function is DISCOURAGED in favor of
1198
 * #htmlNodeDumpFileFormat.
1199
 *
1200
 * @param f  the FILE*
1201
 * @param cur  the document
1202
 * @returns the number of bytes written or -1 in case of failure.
1203
 */
1204
int
1205
0
htmlDocDump(FILE *f, xmlDoc *cur) {
1206
0
    xmlOutputBufferPtr buf;
1207
0
    xmlCharEncodingHandlerPtr handler = NULL;
1208
0
    int ret;
1209
1210
0
    xmlInitParser();
1211
1212
0
    if ((cur == NULL) || (f == NULL)) {
1213
0
  return(-1);
1214
0
    }
1215
1216
0
    if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
1217
0
        return(-1);
1218
0
    buf = xmlOutputBufferCreateFile(f, handler);
1219
0
    if (buf == NULL) {
1220
0
        xmlCharEncCloseFunc(handler);
1221
0
        return(-1);
1222
0
    }
1223
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1224
1225
0
    ret = xmlOutputBufferClose(buf);
1226
0
    return(ret);
1227
0
}
1228
1229
/**
1230
 * Serialize an HTML document to a file.
1231
 *
1232
 * Same as #htmlSaveFileFormat with `encoding` set to NULL and
1233
 * `format` set to 1 which is typically undesired.
1234
 *
1235
 * Use of this function is DISCOURAGED in favor of
1236
 * #htmlSaveFileFormat.
1237
 *
1238
 * @param filename  the filename (or URL)
1239
 * @param cur  the document
1240
 * @returns the number of bytes written or -1 in case of failure.
1241
 */
1242
int
1243
0
htmlSaveFile(const char *filename, xmlDoc *cur) {
1244
0
    return(htmlSaveFileFormat(filename, cur, NULL, 1));
1245
0
}
1246
1247
/**
1248
 * Serialize an HTML document to a file using a given encoding.
1249
 *
1250
 * If `filename` is `"-"`, stdout is used. This is potentially
1251
 * insecure and might be changed in a future version.
1252
 *
1253
 * If encoding is NULL, ASCII with HTML 4.0 named character entities
1254
 * will be used. This is inefficient compared to UTF-8 and might be
1255
 * changed in a future version.
1256
 *
1257
 * Sets or updates meta tags containing the character encoding.
1258
 *
1259
 * @param filename  the filename
1260
 * @param cur  the document
1261
 * @param format  should formatting newlines been added
1262
 * @param encoding  the document encoding (optional)
1263
 * @returns the number of bytes written or -1 in case of failure.
1264
 */
1265
int
1266
htmlSaveFileFormat(const char *filename, xmlDoc *cur,
1267
0
             const char *encoding, int format) {
1268
0
    xmlOutputBufferPtr buf;
1269
0
    xmlCharEncodingHandlerPtr handler = NULL;
1270
0
    int ret;
1271
1272
0
    if ((cur == NULL) || (filename == NULL))
1273
0
        return(-1);
1274
1275
0
    xmlInitParser();
1276
1277
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
1278
0
        return(-1);
1279
1280
    /*
1281
     * save the content to a temp buffer.
1282
     */
1283
0
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1284
0
    if (buf == NULL) {
1285
0
        xmlCharEncCloseFunc(handler);
1286
0
        return(0);
1287
0
    }
1288
1289
0
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1290
1291
0
    ret = xmlOutputBufferClose(buf);
1292
0
    return(ret);
1293
0
}
1294
1295
/**
1296
 * Serialize an HTML document to a file.
1297
 *
1298
 * Same as #htmlSaveFileFormat with `format` set to 1 which is
1299
 * typically undesired. Also see the warnings there. Use of this
1300
 * function is DISCOURAGED in favor of #htmlSaveFileFormat.
1301
 *
1302
 * @param filename  the filename
1303
 * @param cur  the document
1304
 * @param encoding  the document encoding
1305
 * @returns the number of bytes written or -1 in case of failure.
1306
 */
1307
int
1308
0
htmlSaveFileEnc(const char *filename, xmlDoc *cur, const char *encoding) {
1309
0
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1310
0
}
1311
1312
#endif /* LIBXML_OUTPUT_ENABLED */
1313
1314
#endif /* LIBXML_HTML_ENABLED */