Coverage Report

Created: 2023-06-07 06:05

/src/libxml2-2.10.3/HTMLtree.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17
18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/valid.h>
23
#include <libxml/xmlerror.h>
24
#include <libxml/parserInternals.h>
25
#include <libxml/globals.h>
26
#include <libxml/uri.h>
27
28
#include "buf.h"
29
30
/************************************************************************
31
 *                  *
32
 *    Getting/Setting encoding meta tags      *
33
 *                  *
34
 ************************************************************************/
35
36
/**
37
 * htmlGetMetaEncoding:
38
 * @doc:  the document
39
 *
40
 * Encoding definition lookup in the Meta tags
41
 *
42
 * Returns the current encoding as flagged in the HTML source
43
 */
44
const xmlChar *
45
0
htmlGetMetaEncoding(htmlDocPtr doc) {
46
0
    htmlNodePtr cur;
47
0
    const xmlChar *content;
48
0
    const xmlChar *encoding;
49
50
0
    if (doc == NULL)
51
0
  return(NULL);
52
0
    cur = doc->children;
53
54
    /*
55
     * Search the html
56
     */
57
0
    while (cur != NULL) {
58
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
59
0
      if (xmlStrEqual(cur->name, BAD_CAST"html"))
60
0
    break;
61
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
62
0
    goto found_head;
63
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
64
0
    goto found_meta;
65
0
  }
66
0
  cur = cur->next;
67
0
    }
68
0
    if (cur == NULL)
69
0
  return(NULL);
70
0
    cur = cur->children;
71
72
    /*
73
     * Search the head
74
     */
75
0
    while (cur != NULL) {
76
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
77
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
78
0
    break;
79
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
80
0
    goto found_meta;
81
0
  }
82
0
  cur = cur->next;
83
0
    }
84
0
    if (cur == NULL)
85
0
  return(NULL);
86
0
found_head:
87
0
    cur = cur->children;
88
89
    /*
90
     * Search the meta elements
91
     */
92
0
found_meta:
93
0
    while (cur != NULL) {
94
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
95
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
96
0
    xmlAttrPtr attr = cur->properties;
97
0
    int http;
98
0
    const xmlChar *value;
99
100
0
    content = NULL;
101
0
    http = 0;
102
0
    while (attr != NULL) {
103
0
        if ((attr->children != NULL) &&
104
0
            (attr->children->type == XML_TEXT_NODE) &&
105
0
            (attr->children->next == NULL)) {
106
0
      value = attr->children->content;
107
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
108
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
109
0
          http = 1;
110
0
      else if ((value != NULL)
111
0
       && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
112
0
          content = value;
113
0
      if ((http != 0) && (content != NULL))
114
0
          goto found_content;
115
0
        }
116
0
        attr = attr->next;
117
0
    }
118
0
      }
119
0
  }
120
0
  cur = cur->next;
121
0
    }
122
0
    return(NULL);
123
124
0
found_content:
125
0
    encoding = xmlStrstr(content, BAD_CAST"charset=");
126
0
    if (encoding == NULL)
127
0
  encoding = xmlStrstr(content, BAD_CAST"Charset=");
128
0
    if (encoding == NULL)
129
0
  encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
130
0
    if (encoding != NULL) {
131
0
  encoding += 8;
132
0
    } else {
133
0
  encoding = xmlStrstr(content, BAD_CAST"charset =");
134
0
  if (encoding == NULL)
135
0
      encoding = xmlStrstr(content, BAD_CAST"Charset =");
136
0
  if (encoding == NULL)
137
0
      encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
138
0
  if (encoding != NULL)
139
0
      encoding += 9;
140
0
    }
141
0
    if (encoding != NULL) {
142
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
143
0
    }
144
0
    return(encoding);
145
0
}
146
147
/**
148
 * htmlSetMetaEncoding:
149
 * @doc:  the document
150
 * @encoding:  the encoding string
151
 *
152
 * Sets the current encoding in the Meta tags
153
 * NOTE: this will not change the document content encoding, just
154
 * the META flag associated.
155
 *
156
 * Returns 0 in case of success and -1 in case of error
157
 */
158
int
159
0
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
160
0
    htmlNodePtr cur, meta = NULL, head = NULL;
161
0
    const xmlChar *content = NULL;
162
0
    char newcontent[100];
163
164
0
    newcontent[0] = 0;
165
166
0
    if (doc == NULL)
167
0
  return(-1);
168
169
    /* html isn't a real encoding it's just libxml2 way to get entities */
170
0
    if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
171
0
        return(-1);
172
173
0
    if (encoding != NULL) {
174
0
  snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
175
0
                (char *)encoding);
176
0
  newcontent[sizeof(newcontent) - 1] = 0;
177
0
    }
178
179
0
    cur = doc->children;
180
181
    /*
182
     * Search the html
183
     */
184
0
    while (cur != NULL) {
185
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
186
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
187
0
    break;
188
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
189
0
    goto found_head;
190
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
191
0
    goto found_meta;
192
0
  }
193
0
  cur = cur->next;
194
0
    }
195
0
    if (cur == NULL)
196
0
  return(-1);
197
0
    cur = cur->children;
198
199
    /*
200
     * Search the head
201
     */
202
0
    while (cur != NULL) {
203
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
204
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
205
0
    break;
206
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
207
0
                head = cur->parent;
208
0
    goto found_meta;
209
0
            }
210
0
  }
211
0
  cur = cur->next;
212
0
    }
213
0
    if (cur == NULL)
214
0
  return(-1);
215
0
found_head:
216
0
    head = cur;
217
0
    if (cur->children == NULL)
218
0
        goto create;
219
0
    cur = cur->children;
220
221
0
found_meta:
222
    /*
223
     * Search and update all the remaining the meta elements carrying
224
     * encoding information
225
     */
226
0
    while (cur != NULL) {
227
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
228
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
229
0
    xmlAttrPtr attr = cur->properties;
230
0
    int http;
231
0
    const xmlChar *value;
232
233
0
    content = NULL;
234
0
    http = 0;
235
0
    while (attr != NULL) {
236
0
        if ((attr->children != NULL) &&
237
0
            (attr->children->type == XML_TEXT_NODE) &&
238
0
            (attr->children->next == NULL)) {
239
0
      value = attr->children->content;
240
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
241
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
242
0
          http = 1;
243
0
      else
244
0
                        {
245
0
                           if ((value != NULL) &&
246
0
                               (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
247
0
             content = value;
248
0
                        }
249
0
            if ((http != 0) && (content != NULL))
250
0
          break;
251
0
        }
252
0
        attr = attr->next;
253
0
    }
254
0
    if ((http != 0) && (content != NULL)) {
255
0
        meta = cur;
256
0
        break;
257
0
    }
258
259
0
      }
260
0
  }
261
0
  cur = cur->next;
262
0
    }
263
0
create:
264
0
    if (meta == NULL) {
265
0
        if ((encoding != NULL) && (head != NULL)) {
266
            /*
267
             * Create a new Meta element with the right attributes
268
             */
269
270
0
            meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
271
0
            if (head->children == NULL)
272
0
                xmlAddChild(head, meta);
273
0
            else
274
0
                xmlAddPrevSibling(head->children, meta);
275
0
            xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
276
0
            xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
277
0
        }
278
0
    } else {
279
        /* remove the meta tag if NULL is passed */
280
0
        if (encoding == NULL) {
281
0
            xmlUnlinkNode(meta);
282
0
            xmlFreeNode(meta);
283
0
        }
284
        /* change the document only if there is a real encoding change */
285
0
        else if (xmlStrcasestr(content, encoding) == NULL) {
286
0
            xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
287
0
        }
288
0
    }
289
290
291
0
    return(0);
292
0
}
293
294
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
307
308
309
/**
310
 * htmlIsBooleanAttr:
311
 * @name:  the name of the attribute to check
312
 *
313
 * Determine if a given attribute is a boolean attribute.
314
 *
315
 * returns: false if the attribute is not boolean, true otherwise.
316
 */
317
int
318
htmlIsBooleanAttr(const xmlChar *name)
319
0
{
320
0
    int i = 0;
321
322
0
    while (htmlBooleanAttrs[i] != NULL) {
323
0
        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
324
0
            return 1;
325
0
        i++;
326
0
    }
327
0
    return 0;
328
0
}
329
330
#ifdef LIBXML_OUTPUT_ENABLED
331
/*
332
 * private routine exported from xmlIO.c
333
 */
334
xmlOutputBufferPtr
335
xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
336
/************************************************************************
337
 *                  *
338
 *      Output error handlers       *
339
 *                  *
340
 ************************************************************************/
341
/**
342
 * htmlSaveErrMemory:
343
 * @extra:  extra information
344
 *
345
 * Handle an out of memory condition
346
 */
347
static void
348
htmlSaveErrMemory(const char *extra)
349
0
{
350
0
    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
351
0
}
352
353
/**
354
 * htmlSaveErr:
355
 * @code:  the error number
356
 * @node:  the location of the error.
357
 * @extra:  extra information
358
 *
359
 * Handle an out of memory condition
360
 */
361
static void
362
htmlSaveErr(int code, xmlNodePtr node, const char *extra)
363
0
{
364
0
    const char *msg = NULL;
365
366
0
    switch(code) {
367
0
        case XML_SAVE_NOT_UTF8:
368
0
      msg = "string is not in UTF-8\n";
369
0
      break;
370
0
  case XML_SAVE_CHAR_INVALID:
371
0
      msg = "invalid character value\n";
372
0
      break;
373
0
  case XML_SAVE_UNKNOWN_ENCODING:
374
0
      msg = "unknown encoding %s\n";
375
0
      break;
376
0
  case XML_SAVE_NO_DOCTYPE:
377
0
      msg = "HTML has no DOCTYPE\n";
378
0
      break;
379
0
  default:
380
0
      msg = "unexpected error number\n";
381
0
    }
382
0
    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
383
0
}
384
385
/************************************************************************
386
 *                  *
387
 *    Dumping HTML tree content to a simple buffer    *
388
 *                  *
389
 ************************************************************************/
390
391
/**
392
 * htmlBufNodeDumpFormat:
393
 * @buf:  the xmlBufPtr output
394
 * @doc:  the document
395
 * @cur:  the current node
396
 * @format:  should formatting spaces been added
397
 *
398
 * Dump an HTML node, recursive behaviour,children are printed too.
399
 *
400
 * Returns the number of byte written or -1 in case of error
401
 */
402
static size_t
403
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
404
0
             int format) {
405
0
    size_t use;
406
0
    int ret;
407
0
    xmlOutputBufferPtr outbuf;
408
409
0
    if (cur == NULL) {
410
0
  return (-1);
411
0
    }
412
0
    if (buf == NULL) {
413
0
  return (-1);
414
0
    }
415
0
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
416
0
    if (outbuf == NULL) {
417
0
        htmlSaveErrMemory("allocating HTML output buffer");
418
0
  return (-1);
419
0
    }
420
0
    memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
421
0
    outbuf->buffer = buf;
422
0
    outbuf->encoder = NULL;
423
0
    outbuf->writecallback = NULL;
424
0
    outbuf->closecallback = NULL;
425
0
    outbuf->context = NULL;
426
0
    outbuf->written = 0;
427
428
0
    use = xmlBufUse(buf);
429
0
    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
430
0
    xmlFree(outbuf);
431
0
    ret = xmlBufUse(buf) - use;
432
0
    return (ret);
433
0
}
434
435
/**
436
 * htmlNodeDump:
437
 * @buf:  the HTML buffer output
438
 * @doc:  the document
439
 * @cur:  the current node
440
 *
441
 * Dump an HTML node, recursive behaviour,children are printed too,
442
 * and formatting returns are added.
443
 *
444
 * Returns the number of byte written or -1 in case of error
445
 */
446
int
447
0
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
448
0
    xmlBufPtr buffer;
449
0
    size_t ret;
450
451
0
    if ((buf == NULL) || (cur == NULL))
452
0
        return(-1);
453
454
0
    xmlInitParser();
455
0
    buffer = xmlBufFromBuffer(buf);
456
0
    if (buffer == NULL)
457
0
        return(-1);
458
459
0
    ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
460
461
0
    xmlBufBackToBuffer(buffer);
462
463
0
    if (ret > INT_MAX)
464
0
        return(-1);
465
0
    return((int) ret);
466
0
}
467
468
/**
469
 * htmlNodeDumpFileFormat:
470
 * @out:  the FILE pointer
471
 * @doc:  the document
472
 * @cur:  the current node
473
 * @encoding: the document encoding
474
 * @format:  should formatting spaces been added
475
 *
476
 * Dump an HTML node, recursive behaviour,children are printed too.
477
 *
478
 * TODO: if encoding == NULL try to save in the doc encoding
479
 *
480
 * returns: the number of byte written or -1 in case of failure.
481
 */
482
int
483
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
484
0
                 xmlNodePtr cur, const char *encoding, int format) {
485
0
    xmlOutputBufferPtr buf;
486
0
    xmlCharEncodingHandlerPtr handler = NULL;
487
0
    int ret;
488
489
0
    xmlInitParser();
490
491
0
    if (encoding != NULL) {
492
0
  xmlCharEncoding enc;
493
494
0
  enc = xmlParseCharEncoding(encoding);
495
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
496
0
      handler = xmlFindCharEncodingHandler(encoding);
497
0
      if (handler == NULL)
498
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
499
0
  }
500
0
    } else {
501
        /*
502
         * Fallback to HTML or ASCII when the encoding is unspecified
503
         */
504
0
        if (handler == NULL)
505
0
            handler = xmlFindCharEncodingHandler("HTML");
506
0
        if (handler == NULL)
507
0
            handler = xmlFindCharEncodingHandler("ascii");
508
0
    }
509
510
    /*
511
     * save the content to a temp buffer.
512
     */
513
0
    buf = xmlOutputBufferCreateFile(out, handler);
514
0
    if (buf == NULL) return(0);
515
516
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
517
518
0
    ret = xmlOutputBufferClose(buf);
519
0
    return(ret);
520
0
}
521
522
/**
523
 * htmlNodeDumpFile:
524
 * @out:  the FILE pointer
525
 * @doc:  the document
526
 * @cur:  the current node
527
 *
528
 * Dump an HTML node, recursive behaviour,children are printed too,
529
 * and formatting returns are added.
530
 */
531
void
532
0
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
533
0
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
534
0
}
535
536
/**
537
 * htmlDocDumpMemoryFormat:
538
 * @cur:  the document
539
 * @mem:  OUT: the memory pointer
540
 * @size:  OUT: the memory length
541
 * @format:  should formatting spaces been added
542
 *
543
 * Dump an HTML document in memory and return the xmlChar * and it's size.
544
 * It's up to the caller to free the memory.
545
 */
546
void
547
0
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
548
0
    xmlOutputBufferPtr buf;
549
0
    xmlCharEncodingHandlerPtr handler = NULL;
550
0
    const char *encoding;
551
552
0
    xmlInitParser();
553
554
0
    if ((mem == NULL) || (size == NULL))
555
0
        return;
556
0
    if (cur == NULL) {
557
0
  *mem = NULL;
558
0
  *size = 0;
559
0
  return;
560
0
    }
561
562
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
563
564
0
    if (encoding != NULL) {
565
0
  xmlCharEncoding enc;
566
567
0
  enc = xmlParseCharEncoding(encoding);
568
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
569
0
      handler = xmlFindCharEncodingHandler(encoding);
570
0
      if (handler == NULL)
571
0
                htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
572
573
0
  }
574
0
    } else {
575
        /*
576
         * Fallback to HTML or ASCII when the encoding is unspecified
577
         */
578
0
        if (handler == NULL)
579
0
            handler = xmlFindCharEncodingHandler("HTML");
580
0
        if (handler == NULL)
581
0
            handler = xmlFindCharEncodingHandler("ascii");
582
0
    }
583
584
0
    buf = xmlAllocOutputBufferInternal(handler);
585
0
    if (buf == NULL) {
586
0
  *mem = NULL;
587
0
  *size = 0;
588
0
  return;
589
0
    }
590
591
0
    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
592
593
0
    xmlOutputBufferFlush(buf);
594
0
    if (buf->conv != NULL) {
595
0
  *size = xmlBufUse(buf->conv);
596
0
  *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
597
0
    } else {
598
0
  *size = xmlBufUse(buf->buffer);
599
0
  *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
600
0
    }
601
0
    (void)xmlOutputBufferClose(buf);
602
0
}
603
604
/**
605
 * htmlDocDumpMemory:
606
 * @cur:  the document
607
 * @mem:  OUT: the memory pointer
608
 * @size:  OUT: the memory length
609
 *
610
 * Dump an HTML document in memory and return the xmlChar * and it's size.
611
 * It's up to the caller to free the memory.
612
 */
613
void
614
0
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
615
0
  htmlDocDumpMemoryFormat(cur, mem, size, 1);
616
0
}
617
618
619
/************************************************************************
620
 *                  *
621
 *    Dumping HTML tree content to an I/O output buffer *
622
 *                  *
623
 ************************************************************************/
624
625
void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
626
627
/**
628
 * htmlDtdDumpOutput:
629
 * @buf:  the HTML buffer output
630
 * @doc:  the document
631
 * @encoding:  the encoding string
632
 *
633
 * TODO: check whether encoding is needed
634
 *
635
 * Dump the HTML document DTD, if any.
636
 */
637
static void
638
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
639
0
            const char *encoding ATTRIBUTE_UNUSED) {
640
0
    xmlDtdPtr cur = doc->intSubset;
641
642
0
    if (cur == NULL) {
643
0
  htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
644
0
  return;
645
0
    }
646
0
    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
647
0
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
648
0
    if (cur->ExternalID != NULL) {
649
0
  xmlOutputBufferWriteString(buf, " PUBLIC ");
650
0
  xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
651
0
  if (cur->SystemID != NULL) {
652
0
      xmlOutputBufferWriteString(buf, " ");
653
0
      xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
654
0
  }
655
0
    } else if (cur->SystemID != NULL &&
656
0
         xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
657
0
  xmlOutputBufferWriteString(buf, " SYSTEM ");
658
0
  xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
659
0
    }
660
0
    xmlOutputBufferWriteString(buf, ">\n");
661
0
}
662
663
/**
664
 * htmlAttrDumpOutput:
665
 * @buf:  the HTML buffer output
666
 * @doc:  the document
667
 * @cur:  the attribute pointer
668
 *
669
 * Dump an HTML attribute
670
 */
671
static void
672
0
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
673
0
    xmlChar *value;
674
675
    /*
676
     * The html output method should not escape a & character
677
     * occurring in an attribute value immediately followed by
678
     * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
679
     * This is implemented in xmlEncodeEntitiesReentrant
680
     */
681
682
0
    if (cur == NULL) {
683
0
  return;
684
0
    }
685
0
    xmlOutputBufferWriteString(buf, " ");
686
0
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
687
0
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
688
0
  xmlOutputBufferWriteString(buf, ":");
689
0
    }
690
0
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
691
0
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
692
0
  value = xmlNodeListGetString(doc, cur->children, 0);
693
0
  if (value) {
694
0
      xmlOutputBufferWriteString(buf, "=");
695
0
      if ((cur->ns == NULL) && (cur->parent != NULL) &&
696
0
    (cur->parent->ns == NULL) &&
697
0
    ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
698
0
           (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
699
0
     (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
700
0
     ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
701
0
      (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
702
0
    xmlChar *escaped;
703
0
    xmlChar *tmp = value;
704
705
0
    while (IS_BLANK_CH(*tmp)) tmp++;
706
707
    /*
708
     * the < and > have already been escaped at the entity level
709
     * And doing so here breaks server side includes
710
     */
711
0
    escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
712
0
    if (escaped != NULL) {
713
0
        xmlBufWriteQuotedString(buf->buffer, escaped);
714
0
        xmlFree(escaped);
715
0
    } else {
716
0
        xmlBufWriteQuotedString(buf->buffer, value);
717
0
    }
718
0
      } else {
719
0
    xmlBufWriteQuotedString(buf->buffer, value);
720
0
      }
721
0
      xmlFree(value);
722
0
  } else  {
723
0
      xmlOutputBufferWriteString(buf, "=\"\"");
724
0
  }
725
0
    }
726
0
}
727
728
/**
729
 * htmlNodeDumpFormatOutput:
730
 * @buf:  the HTML buffer output
731
 * @doc:  the document
732
 * @cur:  the current node
733
 * @encoding:  the encoding string (unused)
734
 * @format:  should formatting spaces been added
735
 *
736
 * Dump an HTML node, recursive behaviour,children are printed too.
737
 */
738
void
739
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
740
                   xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
741
0
                         int format) {
742
0
    xmlNodePtr root, parent;
743
0
    xmlAttrPtr attr;
744
0
    const htmlElemDesc * info;
745
746
0
    xmlInitParser();
747
748
0
    if ((cur == NULL) || (buf == NULL)) {
749
0
  return;
750
0
    }
751
752
0
    root = cur;
753
0
    parent = cur->parent;
754
0
    while (1) {
755
0
        switch (cur->type) {
756
0
        case XML_HTML_DOCUMENT_NODE:
757
0
        case XML_DOCUMENT_NODE:
758
0
            if (((xmlDocPtr) cur)->intSubset != NULL) {
759
0
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
760
0
            }
761
0
            if (cur->children != NULL) {
762
                /* Always validate cur->parent when descending. */
763
0
                if (cur->parent == parent) {
764
0
                    parent = cur;
765
0
                    cur = cur->children;
766
0
                    continue;
767
0
                }
768
0
            } else {
769
0
                xmlOutputBufferWriteString(buf, "\n");
770
0
            }
771
0
            break;
772
773
0
        case XML_ELEMENT_NODE:
774
            /*
775
             * Some users like lxml are known to pass nodes with a corrupted
776
             * tree structure. Fall back to a recursive call to handle this
777
             * case.
778
             */
779
0
            if ((cur->parent != parent) && (cur->children != NULL)) {
780
0
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
781
0
                break;
782
0
            }
783
784
            /*
785
             * Get specific HTML info for that node.
786
             */
787
0
            if (cur->ns == NULL)
788
0
                info = htmlTagLookup(cur->name);
789
0
            else
790
0
                info = NULL;
791
792
0
            xmlOutputBufferWriteString(buf, "<");
793
0
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
794
0
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
795
0
                xmlOutputBufferWriteString(buf, ":");
796
0
            }
797
0
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
798
0
            if (cur->nsDef)
799
0
                xmlNsListDumpOutput(buf, cur->nsDef);
800
0
            attr = cur->properties;
801
0
            while (attr != NULL) {
802
0
                htmlAttrDumpOutput(buf, doc, attr);
803
0
                attr = attr->next;
804
0
            }
805
806
0
            if ((info != NULL) && (info->empty)) {
807
0
                xmlOutputBufferWriteString(buf, ">");
808
0
            } else if (cur->children == NULL) {
809
0
                if ((info != NULL) && (info->saveEndTag != 0) &&
810
0
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
811
0
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
812
0
                    xmlOutputBufferWriteString(buf, ">");
813
0
                } else {
814
0
                    xmlOutputBufferWriteString(buf, "></");
815
0
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
816
0
                        xmlOutputBufferWriteString(buf,
817
0
                                (const char *)cur->ns->prefix);
818
0
                        xmlOutputBufferWriteString(buf, ":");
819
0
                    }
820
0
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
821
0
                    xmlOutputBufferWriteString(buf, ">");
822
0
                }
823
0
            } else {
824
0
                xmlOutputBufferWriteString(buf, ">");
825
0
                if ((format) && (info != NULL) && (!info->isinline) &&
826
0
                    (cur->children->type != HTML_TEXT_NODE) &&
827
0
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
828
0
                    (cur->children != cur->last) &&
829
0
                    (cur->name != NULL) &&
830
0
                    (cur->name[0] != 'p')) /* p, pre, param */
831
0
                    xmlOutputBufferWriteString(buf, "\n");
832
0
                parent = cur;
833
0
                cur = cur->children;
834
0
                continue;
835
0
            }
836
837
0
            if ((format) && (cur->next != NULL) &&
838
0
                (info != NULL) && (!info->isinline)) {
839
0
                if ((cur->next->type != HTML_TEXT_NODE) &&
840
0
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
841
0
                    (parent != NULL) &&
842
0
                    (parent->name != NULL) &&
843
0
                    (parent->name[0] != 'p')) /* p, pre, param */
844
0
                    xmlOutputBufferWriteString(buf, "\n");
845
0
            }
846
847
0
            break;
848
849
0
        case XML_ATTRIBUTE_NODE:
850
0
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
851
0
            break;
852
853
0
        case HTML_TEXT_NODE:
854
0
            if (cur->content == NULL)
855
0
                break;
856
0
            if (((cur->name == (const xmlChar *)xmlStringText) ||
857
0
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
858
0
                ((parent == NULL) ||
859
0
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
860
0
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
861
0
                xmlChar *buffer;
862
863
0
                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
864
0
                if (buffer != NULL) {
865
0
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
866
0
                    xmlFree(buffer);
867
0
                }
868
0
            } else {
869
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
870
0
            }
871
0
            break;
872
873
0
        case HTML_COMMENT_NODE:
874
0
            if (cur->content != NULL) {
875
0
                xmlOutputBufferWriteString(buf, "<!--");
876
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
877
0
                xmlOutputBufferWriteString(buf, "-->");
878
0
            }
879
0
            break;
880
881
0
        case HTML_PI_NODE:
882
0
            if (cur->name != NULL) {
883
0
                xmlOutputBufferWriteString(buf, "<?");
884
0
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
885
0
                if (cur->content != NULL) {
886
0
                    xmlOutputBufferWriteString(buf, " ");
887
0
                    xmlOutputBufferWriteString(buf,
888
0
                            (const char *)cur->content);
889
0
                }
890
0
                xmlOutputBufferWriteString(buf, ">");
891
0
            }
892
0
            break;
893
894
0
        case HTML_ENTITY_REF_NODE:
895
0
            xmlOutputBufferWriteString(buf, "&");
896
0
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
897
0
            xmlOutputBufferWriteString(buf, ";");
898
0
            break;
899
900
0
        case HTML_PRESERVE_NODE:
901
0
            if (cur->content != NULL) {
902
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
903
0
            }
904
0
            break;
905
906
0
        default:
907
0
            break;
908
0
        }
909
910
0
        while (1) {
911
0
            if (cur == root)
912
0
                return;
913
0
            if (cur->next != NULL) {
914
0
                cur = cur->next;
915
0
                break;
916
0
            }
917
918
0
            cur = parent;
919
            /* cur->parent was validated when descending. */
920
0
            parent = cur->parent;
921
922
0
            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
923
0
                (cur->type == XML_DOCUMENT_NODE)) {
924
0
                xmlOutputBufferWriteString(buf, "\n");
925
0
            } else {
926
0
                if ((format) && (cur->ns == NULL))
927
0
                    info = htmlTagLookup(cur->name);
928
0
                else
929
0
                    info = NULL;
930
931
0
                if ((format) && (info != NULL) && (!info->isinline) &&
932
0
                    (cur->last->type != HTML_TEXT_NODE) &&
933
0
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
934
0
                    (cur->children != cur->last) &&
935
0
                    (cur->name != NULL) &&
936
0
                    (cur->name[0] != 'p')) /* p, pre, param */
937
0
                    xmlOutputBufferWriteString(buf, "\n");
938
939
0
                xmlOutputBufferWriteString(buf, "</");
940
0
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
941
0
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
942
0
                    xmlOutputBufferWriteString(buf, ":");
943
0
                }
944
0
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
945
0
                xmlOutputBufferWriteString(buf, ">");
946
947
0
                if ((format) && (info != NULL) && (!info->isinline) &&
948
0
                    (cur->next != NULL)) {
949
0
                    if ((cur->next->type != HTML_TEXT_NODE) &&
950
0
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
951
0
                        (parent != NULL) &&
952
0
                        (parent->name != NULL) &&
953
0
                        (parent->name[0] != 'p')) /* p, pre, param */
954
0
                        xmlOutputBufferWriteString(buf, "\n");
955
0
                }
956
0
            }
957
0
        }
958
0
    }
959
0
}
960
961
/**
962
 * htmlNodeDumpOutput:
963
 * @buf:  the HTML buffer output
964
 * @doc:  the document
965
 * @cur:  the current node
966
 * @encoding:  the encoding string (unused)
967
 *
968
 * Dump an HTML node, recursive behaviour,children are printed too,
969
 * and formatting returns/spaces are added.
970
 */
971
void
972
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
973
0
             xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
974
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
975
0
}
976
977
/**
978
 * htmlDocContentDumpFormatOutput:
979
 * @buf:  the HTML buffer output
980
 * @cur:  the document
981
 * @encoding:  the encoding string (unused)
982
 * @format:  should formatting spaces been added
983
 *
984
 * Dump an HTML document.
985
 */
986
void
987
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
988
                         const char *encoding ATTRIBUTE_UNUSED,
989
0
                               int format) {
990
0
    int type = 0;
991
0
    if (cur) {
992
0
        type = cur->type;
993
0
        cur->type = XML_HTML_DOCUMENT_NODE;
994
0
    }
995
0
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
996
0
    if (cur)
997
0
        cur->type = (xmlElementType) type;
998
0
}
999
1000
/**
1001
 * htmlDocContentDumpOutput:
1002
 * @buf:  the HTML buffer output
1003
 * @cur:  the document
1004
 * @encoding:  the encoding string (unused)
1005
 *
1006
 * Dump an HTML document. Formatting return/spaces are added.
1007
 */
1008
void
1009
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1010
0
                   const char *encoding ATTRIBUTE_UNUSED) {
1011
0
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1012
0
}
1013
1014
/************************************************************************
1015
 *                  *
1016
 *    Saving functions front-ends       *
1017
 *                  *
1018
 ************************************************************************/
1019
1020
/**
1021
 * htmlDocDump:
1022
 * @f:  the FILE*
1023
 * @cur:  the document
1024
 *
1025
 * Dump an HTML document to an open FILE.
1026
 *
1027
 * returns: the number of byte written or -1 in case of failure.
1028
 */
1029
int
1030
0
htmlDocDump(FILE *f, xmlDocPtr cur) {
1031
0
    xmlOutputBufferPtr buf;
1032
0
    xmlCharEncodingHandlerPtr handler = NULL;
1033
0
    const char *encoding;
1034
0
    int ret;
1035
1036
0
    xmlInitParser();
1037
1038
0
    if ((cur == NULL) || (f == NULL)) {
1039
0
  return(-1);
1040
0
    }
1041
1042
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1043
1044
0
    if (encoding != NULL) {
1045
0
  xmlCharEncoding enc;
1046
1047
0
  enc = xmlParseCharEncoding(encoding);
1048
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1049
0
      handler = xmlFindCharEncodingHandler(encoding);
1050
0
      if (handler == NULL)
1051
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1052
0
  }
1053
0
    } else {
1054
        /*
1055
         * Fallback to HTML or ASCII when the encoding is unspecified
1056
         */
1057
0
        if (handler == NULL)
1058
0
            handler = xmlFindCharEncodingHandler("HTML");
1059
0
        if (handler == NULL)
1060
0
            handler = xmlFindCharEncodingHandler("ascii");
1061
0
    }
1062
1063
0
    buf = xmlOutputBufferCreateFile(f, handler);
1064
0
    if (buf == NULL) return(-1);
1065
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1066
1067
0
    ret = xmlOutputBufferClose(buf);
1068
0
    return(ret);
1069
0
}
1070
1071
/**
1072
 * htmlSaveFile:
1073
 * @filename:  the filename (or URL)
1074
 * @cur:  the document
1075
 *
1076
 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1077
 * used.
1078
 * returns: the number of byte written or -1 in case of failure.
1079
 */
1080
int
1081
0
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1082
0
    xmlOutputBufferPtr buf;
1083
0
    xmlCharEncodingHandlerPtr handler = NULL;
1084
0
    const char *encoding;
1085
0
    int ret;
1086
1087
0
    if ((cur == NULL) || (filename == NULL))
1088
0
        return(-1);
1089
1090
0
    xmlInitParser();
1091
1092
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1093
1094
0
    if (encoding != NULL) {
1095
0
  xmlCharEncoding enc;
1096
1097
0
  enc = xmlParseCharEncoding(encoding);
1098
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1099
0
      handler = xmlFindCharEncodingHandler(encoding);
1100
0
      if (handler == NULL)
1101
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1102
0
  }
1103
0
    } else {
1104
        /*
1105
         * Fallback to HTML or ASCII when the encoding is unspecified
1106
         */
1107
0
        if (handler == NULL)
1108
0
            handler = xmlFindCharEncodingHandler("HTML");
1109
0
        if (handler == NULL)
1110
0
            handler = xmlFindCharEncodingHandler("ascii");
1111
0
    }
1112
1113
    /*
1114
     * save the content to a temp buffer.
1115
     */
1116
0
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1117
0
    if (buf == NULL) return(0);
1118
1119
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1120
1121
0
    ret = xmlOutputBufferClose(buf);
1122
0
    return(ret);
1123
0
}
1124
1125
/**
1126
 * htmlSaveFileFormat:
1127
 * @filename:  the filename
1128
 * @cur:  the document
1129
 * @format:  should formatting spaces been added
1130
 * @encoding: the document encoding
1131
 *
1132
 * Dump an HTML document to a file using a given encoding.
1133
 *
1134
 * returns: the number of byte written or -1 in case of failure.
1135
 */
1136
int
1137
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1138
0
             const char *encoding, int format) {
1139
0
    xmlOutputBufferPtr buf;
1140
0
    xmlCharEncodingHandlerPtr handler = NULL;
1141
0
    int ret;
1142
1143
0
    if ((cur == NULL) || (filename == NULL))
1144
0
        return(-1);
1145
1146
0
    xmlInitParser();
1147
1148
0
    if (encoding != NULL) {
1149
0
  xmlCharEncoding enc;
1150
1151
0
  enc = xmlParseCharEncoding(encoding);
1152
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1153
0
      handler = xmlFindCharEncodingHandler(encoding);
1154
0
      if (handler == NULL)
1155
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1156
0
  }
1157
0
        htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1158
0
    } else {
1159
0
  htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1160
1161
        /*
1162
         * Fallback to HTML or ASCII when the encoding is unspecified
1163
         */
1164
0
        if (handler == NULL)
1165
0
            handler = xmlFindCharEncodingHandler("HTML");
1166
0
        if (handler == NULL)
1167
0
            handler = xmlFindCharEncodingHandler("ascii");
1168
0
    }
1169
1170
    /*
1171
     * save the content to a temp buffer.
1172
     */
1173
0
    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1174
0
    if (buf == NULL) return(0);
1175
1176
0
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1177
1178
0
    ret = xmlOutputBufferClose(buf);
1179
0
    return(ret);
1180
0
}
1181
1182
/**
1183
 * htmlSaveFileEnc:
1184
 * @filename:  the filename
1185
 * @cur:  the document
1186
 * @encoding: the document encoding
1187
 *
1188
 * Dump an HTML document to a file using a given encoding
1189
 * and formatting returns/spaces are added.
1190
 *
1191
 * returns: the number of byte written or -1 in case of failure.
1192
 */
1193
int
1194
0
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1195
0
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1196
0
}
1197
1198
#endif /* LIBXML_OUTPUT_ENABLED */
1199
1200
#endif /* LIBXML_HTML_ENABLED */