Coverage Report

Created: 2025-07-18 06:31

/src/libxml2/HTMLtree.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17
18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/uri.h>
25
26
#include "private/buf.h"
27
#include "private/error.h"
28
#include "private/io.h"
29
#include "private/save.h"
30
31
/************************************************************************
32
 *                  *
33
 *    Getting/Setting encoding meta tags      *
34
 *                  *
35
 ************************************************************************/
36
37
/**
38
 * htmlGetMetaEncoding:
39
 * @doc:  the document
40
 *
41
 * Encoding definition lookup in the Meta tags
42
 *
43
 * Returns the current encoding as flagged in the HTML source
44
 */
45
const xmlChar *
46
0
htmlGetMetaEncoding(htmlDocPtr doc) {
47
0
    htmlNodePtr cur;
48
0
    const xmlChar *content;
49
0
    const xmlChar *encoding;
50
51
0
    if (doc == NULL)
52
0
  return(NULL);
53
0
    cur = doc->children;
54
55
    /*
56
     * Search the html
57
     */
58
0
    while (cur != NULL) {
59
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60
0
      if (xmlStrEqual(cur->name, BAD_CAST"html"))
61
0
    break;
62
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
63
0
    goto found_head;
64
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65
0
    goto found_meta;
66
0
  }
67
0
  cur = cur->next;
68
0
    }
69
0
    if (cur == NULL)
70
0
  return(NULL);
71
0
    cur = cur->children;
72
73
    /*
74
     * Search the head
75
     */
76
0
    while (cur != NULL) {
77
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
79
0
    break;
80
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81
0
    goto found_meta;
82
0
  }
83
0
  cur = cur->next;
84
0
    }
85
0
    if (cur == NULL)
86
0
  return(NULL);
87
0
found_head:
88
0
    cur = cur->children;
89
90
    /*
91
     * Search the meta elements
92
     */
93
0
found_meta:
94
0
    while (cur != NULL) {
95
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97
0
    xmlAttrPtr attr = cur->properties;
98
0
    int http;
99
0
    const xmlChar *value;
100
101
0
    content = NULL;
102
0
    http = 0;
103
0
    while (attr != NULL) {
104
0
        if ((attr->children != NULL) &&
105
0
            (attr->children->type == XML_TEXT_NODE) &&
106
0
            (attr->children->next == NULL)) {
107
0
      value = attr->children->content;
108
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110
0
          http = 1;
111
0
      else if ((value != NULL)
112
0
       && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113
0
          content = value;
114
0
      if ((http != 0) && (content != NULL))
115
0
          goto found_content;
116
0
        }
117
0
        attr = attr->next;
118
0
    }
119
0
      }
120
0
  }
121
0
  cur = cur->next;
122
0
    }
123
0
    return(NULL);
124
125
0
found_content:
126
0
    encoding = xmlStrstr(content, BAD_CAST"charset=");
127
0
    if (encoding == NULL)
128
0
  encoding = xmlStrstr(content, BAD_CAST"Charset=");
129
0
    if (encoding == NULL)
130
0
  encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131
0
    if (encoding != NULL) {
132
0
  encoding += 8;
133
0
    } else {
134
0
  encoding = xmlStrstr(content, BAD_CAST"charset =");
135
0
  if (encoding == NULL)
136
0
      encoding = xmlStrstr(content, BAD_CAST"Charset =");
137
0
  if (encoding == NULL)
138
0
      encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139
0
  if (encoding != NULL)
140
0
      encoding += 9;
141
0
    }
142
0
    if (encoding != NULL) {
143
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144
0
    }
145
0
    return(encoding);
146
0
}
147
148
/**
149
 * htmlSetMetaEncoding:
150
 * @doc:  the document
151
 * @encoding:  the encoding string
152
 *
153
 * Sets the current encoding in the Meta tags
154
 * NOTE: this will not change the document content encoding, just
155
 * the META flag associated.
156
 *
157
 * Returns 0 in case of success and -1 in case of error
158
 */
159
int
160
600
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161
600
    htmlNodePtr cur, meta = NULL, head = NULL;
162
600
    const xmlChar *content = NULL;
163
600
    char newcontent[100];
164
165
600
    newcontent[0] = 0;
166
167
600
    if (doc == NULL)
168
0
  return(-1);
169
170
    /* html isn't a real encoding it's just libxml2 way to get entities */
171
600
    if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172
0
        return(-1);
173
174
600
    if (encoding != NULL) {
175
600
  snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176
600
                (char *)encoding);
177
600
  newcontent[sizeof(newcontent) - 1] = 0;
178
600
    }
179
180
600
    cur = doc->children;
181
182
    /*
183
     * Search the html
184
     */
185
14.9k
    while (cur != NULL) {
186
14.7k
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187
10.9k
      if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188
332
    break;
189
10.5k
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190
0
    goto found_head;
191
10.5k
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192
0
    goto found_meta;
193
10.5k
  }
194
14.3k
  cur = cur->next;
195
14.3k
    }
196
600
    if (cur == NULL)
197
268
  return(-1);
198
332
    cur = cur->children;
199
200
    /*
201
     * Search the head
202
     */
203
4.50k
    while (cur != NULL) {
204
4.21k
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205
2.12k
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206
38
    break;
207
2.08k
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208
0
                head = cur->parent;
209
0
    goto found_meta;
210
0
            }
211
2.08k
  }
212
4.17k
  cur = cur->next;
213
4.17k
    }
214
332
    if (cur == NULL)
215
294
  return(-1);
216
38
found_head:
217
38
    head = cur;
218
38
    if (cur->children == NULL)
219
2
        goto create;
220
36
    cur = cur->children;
221
222
36
found_meta:
223
    /*
224
     * Search and update all the remaining the meta elements carrying
225
     * encoding information
226
     */
227
339
    while (cur != NULL) {
228
303
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229
234
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230
0
    xmlAttrPtr attr = cur->properties;
231
0
    int http;
232
0
    const xmlChar *value;
233
234
0
    content = NULL;
235
0
    http = 0;
236
0
    while (attr != NULL) {
237
0
        if ((attr->children != NULL) &&
238
0
            (attr->children->type == XML_TEXT_NODE) &&
239
0
            (attr->children->next == NULL)) {
240
0
      value = attr->children->content;
241
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243
0
          http = 1;
244
0
      else
245
0
                        {
246
0
                           if ((value != NULL) &&
247
0
                               (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248
0
             content = value;
249
0
                        }
250
0
            if ((http != 0) && (content != NULL))
251
0
          break;
252
0
        }
253
0
        attr = attr->next;
254
0
    }
255
0
    if ((http != 0) && (content != NULL)) {
256
0
        meta = cur;
257
0
        break;
258
0
    }
259
260
0
      }
261
234
  }
262
303
  cur = cur->next;
263
303
    }
264
38
create:
265
38
    if (meta == NULL) {
266
38
        if ((encoding != NULL) && (head != NULL)) {
267
            /*
268
             * Create a new Meta element with the right attributes
269
             */
270
271
38
            meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272
38
            if (head->children == NULL)
273
2
                xmlAddChild(head, meta);
274
36
            else
275
36
                xmlAddPrevSibling(head->children, meta);
276
38
            xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277
38
            xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278
38
        }
279
38
    } else {
280
        /* remove the meta tag if NULL is passed */
281
0
        if (encoding == NULL) {
282
0
            xmlUnlinkNode(meta);
283
0
            xmlFreeNode(meta);
284
0
        }
285
        /* change the document only if there is a real encoding change */
286
0
        else if (xmlStrcasestr(content, encoding) == NULL) {
287
0
            xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288
0
        }
289
0
    }
290
291
292
38
    return(0);
293
36
}
294
295
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated table of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308
309
310
/**
311
 * htmlIsBooleanAttr:
312
 * @name:  the name of the attribute to check
313
 *
314
 * DEPRECATED: Internal function, don't use.
315
 *
316
 * Determine if a given attribute is a boolean attribute.
317
 *
318
 * returns: false if the attribute is not boolean, true otherwise.
319
 */
320
int
321
htmlIsBooleanAttr(const xmlChar *name)
322
24.9k
{
323
24.9k
    int i = 0;
324
325
348k
    while (htmlBooleanAttrs[i] != NULL) {
326
323k
        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
327
0
            return 1;
328
323k
        i++;
329
323k
    }
330
24.9k
    return 0;
331
24.9k
}
332
333
#ifdef LIBXML_OUTPUT_ENABLED
334
/************************************************************************
335
 *                  *
336
 *    Dumping HTML tree content to a simple buffer    *
337
 *                  *
338
 ************************************************************************/
339
340
static xmlParserErrors
341
0
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
342
    /*
343
     * Fallback to HTML if the encoding is unspecified
344
     */
345
0
    if (encoding == NULL)
346
0
        encoding = "HTML";
347
348
0
    return(xmlOpenCharEncodingHandler(encoding, /* output */ 1, out));
349
0
}
350
351
/**
352
 * htmlBufNodeDumpFormat:
353
 * @buf:  the xmlBufPtr output
354
 * @doc:  the document
355
 * @cur:  the current node
356
 * @format:  should formatting spaces been added
357
 *
358
 * Dump an HTML node, recursive behaviour,children are printed too.
359
 *
360
 * Returns the number of byte written or -1 in case of error
361
 */
362
static size_t
363
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
364
0
             int format) {
365
0
    size_t use;
366
0
    size_t ret;
367
0
    xmlOutputBufferPtr outbuf;
368
369
0
    if (cur == NULL) {
370
0
  return ((size_t) -1);
371
0
    }
372
0
    if (buf == NULL) {
373
0
  return ((size_t) -1);
374
0
    }
375
0
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
376
0
    if (outbuf == NULL)
377
0
  return ((size_t) -1);
378
0
    memset(outbuf, 0, sizeof(xmlOutputBuffer));
379
0
    outbuf->buffer = buf;
380
0
    outbuf->encoder = NULL;
381
0
    outbuf->writecallback = NULL;
382
0
    outbuf->closecallback = NULL;
383
0
    outbuf->context = NULL;
384
0
    outbuf->written = 0;
385
386
0
    use = xmlBufUse(buf);
387
0
    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
388
0
    if (outbuf->error)
389
0
        ret = (size_t) -1;
390
0
    else
391
0
        ret = xmlBufUse(buf) - use;
392
0
    xmlFree(outbuf);
393
0
    return (ret);
394
0
}
395
396
/**
397
 * htmlNodeDump:
398
 * @buf:  the HTML buffer output
399
 * @doc:  the document
400
 * @cur:  the current node
401
 *
402
 * Dump an HTML node, recursive behaviour,children are printed too,
403
 * and formatting returns are added.
404
 *
405
 * Returns the number of byte written or -1 in case of error
406
 */
407
int
408
0
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
409
0
    xmlBufPtr buffer;
410
0
    size_t ret1;
411
0
    int ret2;
412
413
0
    if ((buf == NULL) || (cur == NULL))
414
0
        return(-1);
415
416
0
    xmlInitParser();
417
0
    buffer = xmlBufFromBuffer(buf);
418
0
    if (buffer == NULL)
419
0
        return(-1);
420
421
0
    ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
422
423
0
    ret2 = xmlBufBackToBuffer(buffer, buf);
424
425
0
    if ((ret1 == (size_t) -1) || (ret2 < 0))
426
0
        return(-1);
427
0
    return(ret1 > INT_MAX ? INT_MAX : ret1);
428
0
}
429
430
/**
431
 * htmlNodeDumpFileFormat:
432
 * @out:  the FILE pointer
433
 * @doc:  the document
434
 * @cur:  the current node
435
 * @encoding: the document encoding
436
 * @format:  should formatting spaces been added
437
 *
438
 * Dump an HTML node, recursive behaviour,children are printed too.
439
 *
440
 * TODO: if encoding == NULL try to save in the doc encoding
441
 *
442
 * returns: the number of byte written or -1 in case of failure.
443
 */
444
int
445
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
446
0
                 xmlNodePtr cur, const char *encoding, int format) {
447
0
    xmlOutputBufferPtr buf;
448
0
    xmlCharEncodingHandlerPtr handler;
449
0
    int ret;
450
451
0
    xmlInitParser();
452
453
    /*
454
     * save the content to a temp buffer.
455
     */
456
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
457
0
        return(-1);
458
0
    buf = xmlOutputBufferCreateFile(out, handler);
459
0
    if (buf == NULL) {
460
0
        xmlCharEncCloseFunc(handler);
461
0
        return(-1);
462
0
    }
463
464
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
465
466
0
    ret = xmlOutputBufferClose(buf);
467
0
    return(ret);
468
0
}
469
470
/**
471
 * htmlNodeDumpFile:
472
 * @out:  the FILE pointer
473
 * @doc:  the document
474
 * @cur:  the current node
475
 *
476
 * Dump an HTML node, recursive behaviour,children are printed too,
477
 * and formatting returns are added.
478
 */
479
void
480
0
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
481
0
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
482
0
}
483
484
/**
485
 * htmlDocDumpMemoryFormat:
486
 * @cur:  the document
487
 * @mem:  OUT: the memory pointer
488
 * @size:  OUT: the memory length
489
 * @format:  should formatting spaces been added
490
 *
491
 * Dump an HTML document in memory and return the xmlChar * and it's size.
492
 * It's up to the caller to free the memory.
493
 */
494
void
495
0
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
496
0
    xmlOutputBufferPtr buf;
497
0
    xmlCharEncodingHandlerPtr handler = NULL;
498
0
    const char *encoding;
499
500
0
    xmlInitParser();
501
502
0
    if ((mem == NULL) || (size == NULL))
503
0
        return;
504
0
    *mem = NULL;
505
0
    *size = 0;
506
0
    if (cur == NULL)
507
0
  return;
508
509
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
510
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
511
0
        return;
512
0
    buf = xmlAllocOutputBuffer(handler);
513
0
    if (buf == NULL) {
514
0
        xmlCharEncCloseFunc(handler);
515
0
  return;
516
0
    }
517
518
0
    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
519
520
0
    xmlOutputBufferFlush(buf);
521
522
0
    if (!buf->error) {
523
0
        if (buf->conv != NULL) {
524
0
            *size = xmlBufUse(buf->conv);
525
0
            *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
526
0
        } else {
527
0
            *size = xmlBufUse(buf->buffer);
528
0
            *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
529
0
        }
530
0
    }
531
532
0
    xmlOutputBufferClose(buf);
533
0
}
534
535
/**
536
 * htmlDocDumpMemory:
537
 * @cur:  the document
538
 * @mem:  OUT: the memory pointer
539
 * @size:  OUT: the memory length
540
 *
541
 * Dump an HTML document in memory and return the xmlChar * and it's size.
542
 * It's up to the caller to free the memory.
543
 */
544
void
545
0
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
546
0
  htmlDocDumpMemoryFormat(cur, mem, size, 1);
547
0
}
548
549
550
/************************************************************************
551
 *                  *
552
 *    Dumping HTML tree content to an I/O output buffer *
553
 *                  *
554
 ************************************************************************/
555
556
/**
557
 * htmlDtdDumpOutput:
558
 * @buf:  the HTML buffer output
559
 * @doc:  the document
560
 * @encoding:  the encoding string
561
 *
562
 * TODO: check whether encoding is needed
563
 *
564
 * Dump the HTML document DTD, if any.
565
 */
566
static void
567
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
568
7
            const char *encoding ATTRIBUTE_UNUSED) {
569
7
    xmlDtdPtr cur = doc->intSubset;
570
571
7
    if (cur == NULL)
572
0
  return;
573
7
    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
574
7
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
575
7
    if (cur->ExternalID != NULL) {
576
7
  xmlOutputBufferWriteString(buf, " PUBLIC ");
577
7
  xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
578
7
  if (cur->SystemID != NULL) {
579
2
      xmlOutputBufferWriteString(buf, " ");
580
2
      xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
581
2
  }
582
7
    } else if (cur->SystemID != NULL &&
583
0
         xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
584
0
  xmlOutputBufferWriteString(buf, " SYSTEM ");
585
0
  xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
586
0
    }
587
7
    xmlOutputBufferWriteString(buf, ">\n");
588
7
}
589
590
/**
591
 * htmlAttrDumpOutput:
592
 * @buf:  the HTML buffer output
593
 * @doc:  the document
594
 * @cur:  the attribute pointer
595
 *
596
 * Dump an HTML attribute
597
 */
598
static void
599
24.9k
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
600
24.9k
    xmlChar *value;
601
602
    /*
603
     * The html output method should not escape a & character
604
     * occurring in an attribute value immediately followed by
605
     * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
606
     * This is implemented in xmlEncodeEntitiesReentrant
607
     */
608
609
24.9k
    if (cur == NULL) {
610
0
  return;
611
0
    }
612
24.9k
    xmlOutputBufferWriteString(buf, " ");
613
24.9k
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
614
1.04k
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
615
1.04k
  xmlOutputBufferWriteString(buf, ":");
616
1.04k
    }
617
24.9k
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
618
24.9k
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
619
24.9k
  value = xmlNodeListGetString(doc, cur->children, 0);
620
24.9k
  if (value) {
621
24.8k
      xmlOutputBufferWriteString(buf, "=");
622
24.8k
      if ((cur->ns == NULL) && (cur->parent != NULL) &&
623
24.8k
    (cur->parent->ns == NULL) &&
624
24.8k
    ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
625
22.5k
           (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
626
22.5k
     (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
627
22.5k
     ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
628
22.4k
      (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
629
940
    xmlChar *escaped;
630
940
    xmlChar *tmp = value;
631
632
959
    while (IS_BLANK_CH(*tmp)) tmp++;
633
634
    /*
635
                 * Angle brackets are technically illegal in URIs, but they're
636
                 * used in server side includes, for example. Curly brackets
637
                 * are illegal as well and often used in templates.
638
                 * Don't escape non-whitespace, printable ASCII chars for
639
                 * improved interoperability. Only escape space, control
640
                 * and non-ASCII chars.
641
     */
642
940
    escaped = xmlURIEscapeStr(tmp,
643
940
                        BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
644
940
    if (escaped != NULL) {
645
938
        xmlOutputBufferWriteQuotedString(buf, escaped);
646
938
        xmlFree(escaped);
647
938
    } else {
648
2
                    buf->error = XML_ERR_NO_MEMORY;
649
2
    }
650
23.9k
      } else {
651
23.9k
    xmlOutputBufferWriteQuotedString(buf, value);
652
23.9k
      }
653
24.8k
      xmlFree(value);
654
24.8k
  } else  {
655
31
            buf->error = XML_ERR_NO_MEMORY;
656
31
  }
657
24.9k
    }
658
24.9k
}
659
660
/**
661
 * htmlNodeDumpFormatOutput:
662
 * @buf:  the HTML buffer output
663
 * @doc:  the document
664
 * @cur:  the current node
665
 * @encoding:  the encoding string (unused)
666
 * @format:  should formatting spaces been added
667
 *
668
 * Dump an HTML node, recursive behaviour,children are printed too.
669
 */
670
void
671
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
672
                   xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
673
600
                         int format) {
674
600
    xmlNodePtr root, parent;
675
600
    xmlAttrPtr attr;
676
600
    const htmlElemDesc * info;
677
678
600
    xmlInitParser();
679
680
600
    if ((cur == NULL) || (buf == NULL)) {
681
0
  return;
682
0
    }
683
684
600
    root = cur;
685
600
    parent = cur->parent;
686
88.3k
    while (1) {
687
88.3k
        switch (cur->type) {
688
600
        case XML_HTML_DOCUMENT_NODE:
689
600
        case XML_DOCUMENT_NODE:
690
600
            if (((xmlDocPtr) cur)->intSubset != NULL) {
691
7
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
692
7
            }
693
600
            if (cur->children != NULL) {
694
                /* Always validate cur->parent when descending. */
695
600
                if (cur->parent == parent) {
696
600
                    parent = cur;
697
600
                    cur = cur->children;
698
600
                    continue;
699
600
                }
700
600
            } else {
701
0
                xmlOutputBufferWriteString(buf, "\n");
702
0
            }
703
0
            break;
704
705
43.1k
        case XML_ELEMENT_NODE:
706
            /*
707
             * Some users like lxml are known to pass nodes with a corrupted
708
             * tree structure. Fall back to a recursive call to handle this
709
             * case.
710
             */
711
43.1k
            if ((cur->parent != parent) && (cur->children != NULL)) {
712
0
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
713
0
                break;
714
0
            }
715
716
            /*
717
             * Get specific HTML info for that node.
718
             */
719
43.1k
            if (cur->ns == NULL)
720
41.5k
                info = htmlTagLookup(cur->name);
721
1.61k
            else
722
1.61k
                info = NULL;
723
724
43.1k
            xmlOutputBufferWriteString(buf, "<");
725
43.1k
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
726
1.20k
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
727
1.20k
                xmlOutputBufferWriteString(buf, ":");
728
1.20k
            }
729
43.1k
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
730
43.1k
            if (cur->nsDef)
731
10.8k
                xmlNsListDumpOutput(buf, cur->nsDef);
732
43.1k
            attr = cur->properties;
733
68.0k
            while (attr != NULL) {
734
24.9k
                htmlAttrDumpOutput(buf, doc, attr);
735
24.9k
                attr = attr->next;
736
24.9k
            }
737
738
43.1k
            if ((info != NULL) && (info->empty)) {
739
115
                xmlOutputBufferWriteString(buf, ">");
740
43.0k
            } else if (cur->children == NULL) {
741
19.9k
                if ((info != NULL) && (info->saveEndTag != 0) &&
742
19.9k
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
743
19.9k
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
744
0
                    xmlOutputBufferWriteString(buf, ">");
745
19.9k
                } else {
746
19.9k
                    xmlOutputBufferWriteString(buf, "></");
747
19.9k
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
748
541
                        xmlOutputBufferWriteString(buf,
749
541
                                (const char *)cur->ns->prefix);
750
541
                        xmlOutputBufferWriteString(buf, ":");
751
541
                    }
752
19.9k
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
753
19.9k
                    xmlOutputBufferWriteString(buf, ">");
754
19.9k
                }
755
23.0k
            } else {
756
23.0k
                xmlOutputBufferWriteString(buf, ">");
757
23.0k
                if ((format) && (info != NULL) && (!info->isinline) &&
758
23.0k
                    (cur->children->type != HTML_TEXT_NODE) &&
759
23.0k
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
760
23.0k
                    (cur->children != cur->last) &&
761
23.0k
                    (cur->name != NULL) &&
762
23.0k
                    (cur->name[0] != 'p')) /* p, pre, param */
763
1.03k
                    xmlOutputBufferWriteString(buf, "\n");
764
23.0k
                parent = cur;
765
23.0k
                cur = cur->children;
766
23.0k
                continue;
767
23.0k
            }
768
769
20.0k
            if ((format) && (cur->next != NULL) &&
770
20.0k
                (info != NULL) && (!info->isinline)) {
771
1.51k
                if ((cur->next->type != HTML_TEXT_NODE) &&
772
1.51k
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
773
1.51k
                    (parent != NULL) &&
774
1.51k
                    (parent->name != NULL) &&
775
1.51k
                    (parent->name[0] != 'p')) /* p, pre, param */
776
614
                    xmlOutputBufferWriteString(buf, "\n");
777
1.51k
            }
778
779
20.0k
            break;
780
781
0
        case XML_ATTRIBUTE_NODE:
782
0
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
783
0
            break;
784
785
40.6k
        case HTML_TEXT_NODE:
786
40.6k
            if (cur->content == NULL)
787
0
                break;
788
40.6k
            if (((cur->name == (const xmlChar *)xmlStringText) ||
789
40.6k
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
790
40.6k
                ((parent == NULL) ||
791
40.6k
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
792
40.6k
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
793
40.6k
                xmlChar *buffer;
794
795
40.6k
                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
796
40.6k
                if (buffer == NULL) {
797
29
                    buf->error = XML_ERR_NO_MEMORY;
798
29
                    return;
799
29
                }
800
40.6k
                xmlOutputBufferWriteString(buf, (const char *)buffer);
801
40.6k
                xmlFree(buffer);
802
40.6k
            } else {
803
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
804
0
            }
805
40.6k
            break;
806
807
40.6k
        case HTML_COMMENT_NODE:
808
157
            if (cur->content != NULL) {
809
157
                xmlOutputBufferWriteString(buf, "<!--");
810
157
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
811
157
                xmlOutputBufferWriteString(buf, "-->");
812
157
            }
813
157
            break;
814
815
3.78k
        case HTML_PI_NODE:
816
3.78k
            if (cur->name != NULL) {
817
3.78k
                xmlOutputBufferWriteString(buf, "<?");
818
3.78k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
819
3.78k
                if (cur->content != NULL) {
820
1.32k
                    xmlOutputBufferWriteString(buf, " ");
821
1.32k
                    xmlOutputBufferWriteString(buf,
822
1.32k
                            (const char *)cur->content);
823
1.32k
                }
824
3.78k
                xmlOutputBufferWriteString(buf, ">");
825
3.78k
            }
826
3.78k
            break;
827
828
0
        case HTML_ENTITY_REF_NODE:
829
0
            xmlOutputBufferWriteString(buf, "&");
830
0
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
831
0
            xmlOutputBufferWriteString(buf, ";");
832
0
            break;
833
834
0
        case HTML_PRESERVE_NODE:
835
0
            if (cur->content != NULL) {
836
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
837
0
            }
838
0
            break;
839
840
7
        default:
841
7
            break;
842
88.3k
        }
843
844
88.2k
        while (1) {
845
88.2k
            if (cur == root)
846
571
                return;
847
87.6k
            if (cur->next != NULL) {
848
64.0k
                cur = cur->next;
849
64.0k
                break;
850
64.0k
            }
851
852
23.5k
            cur = parent;
853
            /* cur->parent was validated when descending. */
854
23.5k
            parent = cur->parent;
855
856
23.5k
            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
857
23.5k
                (cur->type == XML_DOCUMENT_NODE)) {
858
571
                xmlOutputBufferWriteString(buf, "\n");
859
23.0k
            } else {
860
23.0k
                if ((format) && (cur->ns == NULL))
861
22.2k
                    info = htmlTagLookup(cur->name);
862
755
                else
863
755
                    info = NULL;
864
865
23.0k
                if ((format) && (info != NULL) && (!info->isinline) &&
866
23.0k
                    (cur->last->type != HTML_TEXT_NODE) &&
867
23.0k
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
868
23.0k
                    (cur->children != cur->last) &&
869
23.0k
                    (cur->name != NULL) &&
870
23.0k
                    (cur->name[0] != 'p')) /* p, pre, param */
871
5.93k
                    xmlOutputBufferWriteString(buf, "\n");
872
873
23.0k
                xmlOutputBufferWriteString(buf, "</");
874
23.0k
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
875
663
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
876
663
                    xmlOutputBufferWriteString(buf, ":");
877
663
                }
878
23.0k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
879
23.0k
                xmlOutputBufferWriteString(buf, ">");
880
881
23.0k
                if ((format) && (info != NULL) && (!info->isinline) &&
882
23.0k
                    (cur->next != NULL)) {
883
8.20k
                    if ((cur->next->type != HTML_TEXT_NODE) &&
884
8.20k
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
885
8.20k
                        (parent != NULL) &&
886
8.20k
                        (parent->name != NULL) &&
887
8.20k
                        (parent->name[0] != 'p')) /* p, pre, param */
888
5.88k
                        xmlOutputBufferWriteString(buf, "\n");
889
8.20k
                }
890
23.0k
            }
891
23.5k
        }
892
64.6k
    }
893
600
}
894
895
/**
896
 * htmlNodeDumpOutput:
897
 * @buf:  the HTML buffer output
898
 * @doc:  the document
899
 * @cur:  the current node
900
 * @encoding:  the encoding string (unused)
901
 *
902
 * Dump an HTML node, recursive behaviour,children are printed too,
903
 * and formatting returns/spaces are added.
904
 */
905
void
906
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
907
0
             xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
908
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
909
0
}
910
911
/**
912
 * htmlDocContentDumpFormatOutput:
913
 * @buf:  the HTML buffer output
914
 * @cur:  the document
915
 * @encoding:  the encoding string (unused)
916
 * @format:  should formatting spaces been added
917
 *
918
 * Dump an HTML document.
919
 */
920
void
921
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
922
                         const char *encoding ATTRIBUTE_UNUSED,
923
600
                               int format) {
924
600
    int type = 0;
925
600
    if (cur) {
926
600
        type = cur->type;
927
600
        cur->type = XML_HTML_DOCUMENT_NODE;
928
600
    }
929
600
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
930
600
    if (cur)
931
600
        cur->type = (xmlElementType) type;
932
600
}
933
934
/**
935
 * htmlDocContentDumpOutput:
936
 * @buf:  the HTML buffer output
937
 * @cur:  the document
938
 * @encoding:  the encoding string (unused)
939
 *
940
 * Dump an HTML document. Formatting return/spaces are added.
941
 */
942
void
943
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
944
0
                   const char *encoding ATTRIBUTE_UNUSED) {
945
0
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
946
0
}
947
948
/************************************************************************
949
 *                  *
950
 *    Saving functions front-ends       *
951
 *                  *
952
 ************************************************************************/
953
954
/**
955
 * htmlDocDump:
956
 * @f:  the FILE*
957
 * @cur:  the document
958
 *
959
 * Dump an HTML document to an open FILE.
960
 *
961
 * returns: the number of byte written or -1 in case of failure.
962
 */
963
int
964
0
htmlDocDump(FILE *f, xmlDocPtr cur) {
965
0
    xmlOutputBufferPtr buf;
966
0
    xmlCharEncodingHandlerPtr handler = NULL;
967
0
    const char *encoding;
968
0
    int ret;
969
970
0
    xmlInitParser();
971
972
0
    if ((cur == NULL) || (f == NULL)) {
973
0
  return(-1);
974
0
    }
975
976
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
977
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
978
0
        return(-1);
979
0
    buf = xmlOutputBufferCreateFile(f, handler);
980
0
    if (buf == NULL) {
981
0
        xmlCharEncCloseFunc(handler);
982
0
        return(-1);
983
0
    }
984
0
    htmlDocContentDumpOutput(buf, cur, NULL);
985
986
0
    ret = xmlOutputBufferClose(buf);
987
0
    return(ret);
988
0
}
989
990
/**
991
 * htmlSaveFile:
992
 * @filename:  the filename (or URL)
993
 * @cur:  the document
994
 *
995
 * Dump an HTML document to a file. If @filename is "-" the stdout file is
996
 * used.
997
 * returns: the number of byte written or -1 in case of failure.
998
 */
999
int
1000
0
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1001
0
    xmlOutputBufferPtr buf;
1002
0
    xmlCharEncodingHandlerPtr handler = NULL;
1003
0
    const char *encoding;
1004
0
    int ret;
1005
1006
0
    if ((cur == NULL) || (filename == NULL))
1007
0
        return(-1);
1008
1009
0
    xmlInitParser();
1010
1011
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1012
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
1013
0
        return(-1);
1014
0
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1015
0
    if (buf == NULL)
1016
0
        return(-1);
1017
1018
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1019
1020
0
    ret = xmlOutputBufferClose(buf);
1021
0
    return(ret);
1022
0
}
1023
1024
/**
1025
 * htmlSaveFileFormat:
1026
 * @filename:  the filename
1027
 * @cur:  the document
1028
 * @format:  should formatting spaces been added
1029
 * @encoding: the document encoding
1030
 *
1031
 * Dump an HTML document to a file using a given encoding.
1032
 *
1033
 * returns: the number of byte written or -1 in case of failure.
1034
 */
1035
int
1036
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1037
0
             const char *encoding, int format) {
1038
0
    xmlOutputBufferPtr buf;
1039
0
    xmlCharEncodingHandlerPtr handler = NULL;
1040
0
    int ret;
1041
1042
0
    if ((cur == NULL) || (filename == NULL))
1043
0
        return(-1);
1044
1045
0
    xmlInitParser();
1046
1047
0
    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
1048
0
        return(-1);
1049
0
    if (handler != NULL)
1050
0
        htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1051
0
    else
1052
0
  htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1053
1054
    /*
1055
     * save the content to a temp buffer.
1056
     */
1057
0
    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1058
0
    if (buf == NULL) {
1059
0
        xmlCharEncCloseFunc(handler);
1060
0
        return(0);
1061
0
    }
1062
1063
0
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1064
1065
0
    ret = xmlOutputBufferClose(buf);
1066
0
    return(ret);
1067
0
}
1068
1069
/**
1070
 * htmlSaveFileEnc:
1071
 * @filename:  the filename
1072
 * @cur:  the document
1073
 * @encoding: the document encoding
1074
 *
1075
 * Dump an HTML document to a file using a given encoding
1076
 * and formatting returns/spaces are added.
1077
 *
1078
 * returns: the number of byte written or -1 in case of failure.
1079
 */
1080
int
1081
0
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1082
0
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1083
0
}
1084
1085
#endif /* LIBXML_OUTPUT_ENABLED */
1086
1087
#endif /* LIBXML_HTML_ENABLED */