Coverage Report

Created: 2023-06-07 06:14

/src/libxml2/HTMLtree.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17
18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/valid.h>
23
#include <libxml/xmlerror.h>
24
#include <libxml/parserInternals.h>
25
#include <libxml/globals.h>
26
#include <libxml/uri.h>
27
28
#include "private/buf.h"
29
#include "private/error.h"
30
#include "private/io.h"
31
#include "private/save.h"
32
33
/************************************************************************
34
 *                  *
35
 *    Getting/Setting encoding meta tags      *
36
 *                  *
37
 ************************************************************************/
38
39
/**
40
 * htmlGetMetaEncoding:
41
 * @doc:  the document
42
 *
43
 * Encoding definition lookup in the Meta tags
44
 *
45
 * Returns the current encoding as flagged in the HTML source
46
 */
47
const xmlChar *
48
0
htmlGetMetaEncoding(htmlDocPtr doc) {
49
0
    htmlNodePtr cur;
50
0
    const xmlChar *content;
51
0
    const xmlChar *encoding;
52
53
0
    if (doc == NULL)
54
0
  return(NULL);
55
0
    cur = doc->children;
56
57
    /*
58
     * Search the html
59
     */
60
0
    while (cur != NULL) {
61
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62
0
      if (xmlStrEqual(cur->name, BAD_CAST"html"))
63
0
    break;
64
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
65
0
    goto found_head;
66
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67
0
    goto found_meta;
68
0
  }
69
0
  cur = cur->next;
70
0
    }
71
0
    if (cur == NULL)
72
0
  return(NULL);
73
0
    cur = cur->children;
74
75
    /*
76
     * Search the head
77
     */
78
0
    while (cur != NULL) {
79
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
81
0
    break;
82
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83
0
    goto found_meta;
84
0
  }
85
0
  cur = cur->next;
86
0
    }
87
0
    if (cur == NULL)
88
0
  return(NULL);
89
0
found_head:
90
0
    cur = cur->children;
91
92
    /*
93
     * Search the meta elements
94
     */
95
0
found_meta:
96
0
    while (cur != NULL) {
97
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99
0
    xmlAttrPtr attr = cur->properties;
100
0
    int http;
101
0
    const xmlChar *value;
102
103
0
    content = NULL;
104
0
    http = 0;
105
0
    while (attr != NULL) {
106
0
        if ((attr->children != NULL) &&
107
0
            (attr->children->type == XML_TEXT_NODE) &&
108
0
            (attr->children->next == NULL)) {
109
0
      value = attr->children->content;
110
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112
0
          http = 1;
113
0
      else if ((value != NULL)
114
0
       && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115
0
          content = value;
116
0
      if ((http != 0) && (content != NULL))
117
0
          goto found_content;
118
0
        }
119
0
        attr = attr->next;
120
0
    }
121
0
      }
122
0
  }
123
0
  cur = cur->next;
124
0
    }
125
0
    return(NULL);
126
127
0
found_content:
128
0
    encoding = xmlStrstr(content, BAD_CAST"charset=");
129
0
    if (encoding == NULL)
130
0
  encoding = xmlStrstr(content, BAD_CAST"Charset=");
131
0
    if (encoding == NULL)
132
0
  encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133
0
    if (encoding != NULL) {
134
0
  encoding += 8;
135
0
    } else {
136
0
  encoding = xmlStrstr(content, BAD_CAST"charset =");
137
0
  if (encoding == NULL)
138
0
      encoding = xmlStrstr(content, BAD_CAST"Charset =");
139
0
  if (encoding == NULL)
140
0
      encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141
0
  if (encoding != NULL)
142
0
      encoding += 9;
143
0
    }
144
0
    if (encoding != NULL) {
145
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146
0
    }
147
0
    return(encoding);
148
0
}
149
150
/**
151
 * htmlSetMetaEncoding:
152
 * @doc:  the document
153
 * @encoding:  the encoding string
154
 *
155
 * Sets the current encoding in the Meta tags
156
 * NOTE: this will not change the document content encoding, just
157
 * the META flag associated.
158
 *
159
 * Returns 0 in case of success and -1 in case of error
160
 */
161
int
162
535
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163
535
    htmlNodePtr cur, meta = NULL, head = NULL;
164
535
    const xmlChar *content = NULL;
165
535
    char newcontent[100];
166
167
535
    newcontent[0] = 0;
168
169
535
    if (doc == NULL)
170
0
  return(-1);
171
172
    /* html isn't a real encoding it's just libxml2 way to get entities */
173
535
    if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
174
0
        return(-1);
175
176
535
    if (encoding != NULL) {
177
535
  snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
178
535
                (char *)encoding);
179
535
  newcontent[sizeof(newcontent) - 1] = 0;
180
535
    }
181
182
535
    cur = doc->children;
183
184
    /*
185
     * Search the html
186
     */
187
47.0k
    while (cur != NULL) {
188
46.7k
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
189
35.8k
      if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
190
273
    break;
191
35.5k
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
192
0
    goto found_head;
193
35.5k
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
194
0
    goto found_meta;
195
35.5k
  }
196
46.5k
  cur = cur->next;
197
46.5k
    }
198
535
    if (cur == NULL)
199
262
  return(-1);
200
273
    cur = cur->children;
201
202
    /*
203
     * Search the head
204
     */
205
1.76k
    while (cur != NULL) {
206
1.51k
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
207
964
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
208
20
    break;
209
944
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
210
0
                head = cur->parent;
211
0
    goto found_meta;
212
0
            }
213
944
  }
214
1.49k
  cur = cur->next;
215
1.49k
    }
216
273
    if (cur == NULL)
217
253
  return(-1);
218
20
found_head:
219
20
    head = cur;
220
20
    if (cur->children == NULL)
221
1
        goto create;
222
19
    cur = cur->children;
223
224
19
found_meta:
225
    /*
226
     * Search and update all the remaining the meta elements carrying
227
     * encoding information
228
     */
229
86
    while (cur != NULL) {
230
67
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
231
52
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
232
0
    xmlAttrPtr attr = cur->properties;
233
0
    int http;
234
0
    const xmlChar *value;
235
236
0
    content = NULL;
237
0
    http = 0;
238
0
    while (attr != NULL) {
239
0
        if ((attr->children != NULL) &&
240
0
            (attr->children->type == XML_TEXT_NODE) &&
241
0
            (attr->children->next == NULL)) {
242
0
      value = attr->children->content;
243
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
244
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
245
0
          http = 1;
246
0
      else
247
0
                        {
248
0
                           if ((value != NULL) &&
249
0
                               (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
250
0
             content = value;
251
0
                        }
252
0
            if ((http != 0) && (content != NULL))
253
0
          break;
254
0
        }
255
0
        attr = attr->next;
256
0
    }
257
0
    if ((http != 0) && (content != NULL)) {
258
0
        meta = cur;
259
0
        break;
260
0
    }
261
262
0
      }
263
52
  }
264
67
  cur = cur->next;
265
67
    }
266
20
create:
267
20
    if (meta == NULL) {
268
20
        if ((encoding != NULL) && (head != NULL)) {
269
            /*
270
             * Create a new Meta element with the right attributes
271
             */
272
273
20
            meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
274
20
            if (head->children == NULL)
275
1
                xmlAddChild(head, meta);
276
19
            else
277
19
                xmlAddPrevSibling(head->children, meta);
278
20
            xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
279
20
            xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
280
20
        }
281
20
    } else {
282
        /* remove the meta tag if NULL is passed */
283
0
        if (encoding == NULL) {
284
0
            xmlUnlinkNode(meta);
285
0
            xmlFreeNode(meta);
286
0
        }
287
        /* change the document only if there is a real encoding change */
288
0
        else if (xmlStrcasestr(content, encoding) == NULL) {
289
0
            xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
290
0
        }
291
0
    }
292
293
294
20
    return(0);
295
19
}
296
297
/**
298
 * booleanHTMLAttrs:
299
 *
300
 * These are the HTML attributes which will be output
301
 * in minimized form, i.e. <option selected="selected"> will be
302
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
303
 *
304
 */
305
static const char* const htmlBooleanAttrs[] = {
306
  "checked", "compact", "declare", "defer", "disabled", "ismap",
307
  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
308
  "selected", NULL
309
};
310
311
312
/**
313
 * htmlIsBooleanAttr:
314
 * @name:  the name of the attribute to check
315
 *
316
 * Determine if a given attribute is a boolean attribute.
317
 *
318
 * returns: false if the attribute is not boolean, true otherwise.
319
 */
320
int
321
htmlIsBooleanAttr(const xmlChar *name)
322
148k
{
323
148k
    int i = 0;
324
325
2.07M
    while (htmlBooleanAttrs[i] != NULL) {
326
1.93M
        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
327
110
            return 1;
328
1.93M
        i++;
329
1.93M
    }
330
148k
    return 0;
331
148k
}
332
333
#ifdef LIBXML_OUTPUT_ENABLED
334
/************************************************************************
335
 *                  *
336
 *      Output error handlers       *
337
 *                  *
338
 ************************************************************************/
339
/**
340
 * htmlSaveErrMemory:
341
 * @extra:  extra information
342
 *
343
 * Handle an out of memory condition
344
 */
345
static void
346
htmlSaveErrMemory(const char *extra)
347
0
{
348
0
    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
349
0
}
350
351
/**
352
 * htmlSaveErr:
353
 * @code:  the error number
354
 * @node:  the location of the error.
355
 * @extra:  extra information
356
 *
357
 * Handle an out of memory condition
358
 */
359
static void
360
htmlSaveErr(int code, xmlNodePtr node, const char *extra)
361
0
{
362
0
    const char *msg = NULL;
363
364
0
    switch(code) {
365
0
        case XML_SAVE_NOT_UTF8:
366
0
      msg = "string is not in UTF-8\n";
367
0
      break;
368
0
  case XML_SAVE_CHAR_INVALID:
369
0
      msg = "invalid character value\n";
370
0
      break;
371
0
  case XML_SAVE_UNKNOWN_ENCODING:
372
0
      msg = "unknown encoding %s\n";
373
0
      break;
374
0
  case XML_SAVE_NO_DOCTYPE:
375
0
      msg = "HTML has no DOCTYPE\n";
376
0
      break;
377
0
  default:
378
0
      msg = "unexpected error number\n";
379
0
    }
380
0
    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
381
0
}
382
383
/************************************************************************
384
 *                  *
385
 *    Dumping HTML tree content to a simple buffer    *
386
 *                  *
387
 ************************************************************************/
388
389
/**
390
 * htmlBufNodeDumpFormat:
391
 * @buf:  the xmlBufPtr output
392
 * @doc:  the document
393
 * @cur:  the current node
394
 * @format:  should formatting spaces been added
395
 *
396
 * Dump an HTML node, recursive behaviour,children are printed too.
397
 *
398
 * Returns the number of byte written or -1 in case of error
399
 */
400
static size_t
401
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
402
0
             int format) {
403
0
    size_t use;
404
0
    int ret;
405
0
    xmlOutputBufferPtr outbuf;
406
407
0
    if (cur == NULL) {
408
0
  return (-1);
409
0
    }
410
0
    if (buf == NULL) {
411
0
  return (-1);
412
0
    }
413
0
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
414
0
    if (outbuf == NULL) {
415
0
        htmlSaveErrMemory("allocating HTML output buffer");
416
0
  return (-1);
417
0
    }
418
0
    memset(outbuf, 0, sizeof(xmlOutputBuffer));
419
0
    outbuf->buffer = buf;
420
0
    outbuf->encoder = NULL;
421
0
    outbuf->writecallback = NULL;
422
0
    outbuf->closecallback = NULL;
423
0
    outbuf->context = NULL;
424
0
    outbuf->written = 0;
425
426
0
    use = xmlBufUse(buf);
427
0
    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
428
0
    xmlFree(outbuf);
429
0
    ret = xmlBufUse(buf) - use;
430
0
    return (ret);
431
0
}
432
433
/**
434
 * htmlNodeDump:
435
 * @buf:  the HTML buffer output
436
 * @doc:  the document
437
 * @cur:  the current node
438
 *
439
 * Dump an HTML node, recursive behaviour,children are printed too,
440
 * and formatting returns are added.
441
 *
442
 * Returns the number of byte written or -1 in case of error
443
 */
444
int
445
0
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
446
0
    xmlBufPtr buffer;
447
0
    size_t ret;
448
449
0
    if ((buf == NULL) || (cur == NULL))
450
0
        return(-1);
451
452
0
    xmlInitParser();
453
0
    buffer = xmlBufFromBuffer(buf);
454
0
    if (buffer == NULL)
455
0
        return(-1);
456
457
0
    ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
458
459
0
    xmlBufBackToBuffer(buffer);
460
461
0
    if (ret > INT_MAX)
462
0
        return(-1);
463
0
    return((int) ret);
464
0
}
465
466
/**
467
 * htmlNodeDumpFileFormat:
468
 * @out:  the FILE pointer
469
 * @doc:  the document
470
 * @cur:  the current node
471
 * @encoding: the document encoding
472
 * @format:  should formatting spaces been added
473
 *
474
 * Dump an HTML node, recursive behaviour,children are printed too.
475
 *
476
 * TODO: if encoding == NULL try to save in the doc encoding
477
 *
478
 * returns: the number of byte written or -1 in case of failure.
479
 */
480
int
481
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
482
0
                 xmlNodePtr cur, const char *encoding, int format) {
483
0
    xmlOutputBufferPtr buf;
484
0
    xmlCharEncodingHandlerPtr handler = NULL;
485
0
    int ret;
486
487
0
    xmlInitParser();
488
489
0
    if (encoding != NULL) {
490
0
  xmlCharEncoding enc;
491
492
0
  enc = xmlParseCharEncoding(encoding);
493
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
494
0
      handler = xmlFindCharEncodingHandler(encoding);
495
0
      if (handler == NULL)
496
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
497
0
  }
498
0
    } else {
499
        /*
500
         * Fallback to HTML or ASCII when the encoding is unspecified
501
         */
502
0
        if (handler == NULL)
503
0
            handler = xmlFindCharEncodingHandler("HTML");
504
0
        if (handler == NULL)
505
0
            handler = xmlFindCharEncodingHandler("ascii");
506
0
    }
507
508
    /*
509
     * save the content to a temp buffer.
510
     */
511
0
    buf = xmlOutputBufferCreateFile(out, handler);
512
0
    if (buf == NULL) return(0);
513
514
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
515
516
0
    ret = xmlOutputBufferClose(buf);
517
0
    return(ret);
518
0
}
519
520
/**
521
 * htmlNodeDumpFile:
522
 * @out:  the FILE pointer
523
 * @doc:  the document
524
 * @cur:  the current node
525
 *
526
 * Dump an HTML node, recursive behaviour,children are printed too,
527
 * and formatting returns are added.
528
 */
529
void
530
0
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
531
0
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
532
0
}
533
534
/**
535
 * htmlDocDumpMemoryFormat:
536
 * @cur:  the document
537
 * @mem:  OUT: the memory pointer
538
 * @size:  OUT: the memory length
539
 * @format:  should formatting spaces been added
540
 *
541
 * Dump an HTML document in memory and return the xmlChar * and it's size.
542
 * It's up to the caller to free the memory.
543
 */
544
void
545
0
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
546
0
    xmlOutputBufferPtr buf;
547
0
    xmlCharEncodingHandlerPtr handler = NULL;
548
0
    const char *encoding;
549
550
0
    xmlInitParser();
551
552
0
    if ((mem == NULL) || (size == NULL))
553
0
        return;
554
0
    if (cur == NULL) {
555
0
  *mem = NULL;
556
0
  *size = 0;
557
0
  return;
558
0
    }
559
560
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
561
562
0
    if (encoding != NULL) {
563
0
  xmlCharEncoding enc;
564
565
0
  enc = xmlParseCharEncoding(encoding);
566
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
567
0
      handler = xmlFindCharEncodingHandler(encoding);
568
0
      if (handler == NULL)
569
0
                htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
570
571
0
  }
572
0
    } else {
573
        /*
574
         * Fallback to HTML or ASCII when the encoding is unspecified
575
         */
576
0
        if (handler == NULL)
577
0
            handler = xmlFindCharEncodingHandler("HTML");
578
0
        if (handler == NULL)
579
0
            handler = xmlFindCharEncodingHandler("ascii");
580
0
    }
581
582
0
    buf = xmlAllocOutputBufferInternal(handler);
583
0
    if (buf == NULL) {
584
0
  *mem = NULL;
585
0
  *size = 0;
586
0
  return;
587
0
    }
588
589
0
    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
590
591
0
    xmlOutputBufferFlush(buf);
592
0
    if (buf->conv != NULL) {
593
0
  *size = xmlBufUse(buf->conv);
594
0
  *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
595
0
    } else {
596
0
  *size = xmlBufUse(buf->buffer);
597
0
  *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
598
0
    }
599
0
    (void)xmlOutputBufferClose(buf);
600
0
}
601
602
/**
603
 * htmlDocDumpMemory:
604
 * @cur:  the document
605
 * @mem:  OUT: the memory pointer
606
 * @size:  OUT: the memory length
607
 *
608
 * Dump an HTML document in memory and return the xmlChar * and it's size.
609
 * It's up to the caller to free the memory.
610
 */
611
void
612
0
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
613
0
  htmlDocDumpMemoryFormat(cur, mem, size, 1);
614
0
}
615
616
617
/************************************************************************
618
 *                  *
619
 *    Dumping HTML tree content to an I/O output buffer *
620
 *                  *
621
 ************************************************************************/
622
623
/**
624
 * htmlDtdDumpOutput:
625
 * @buf:  the HTML buffer output
626
 * @doc:  the document
627
 * @encoding:  the encoding string
628
 *
629
 * TODO: check whether encoding is needed
630
 *
631
 * Dump the HTML document DTD, if any.
632
 */
633
static void
634
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
635
0
            const char *encoding ATTRIBUTE_UNUSED) {
636
0
    xmlDtdPtr cur = doc->intSubset;
637
638
0
    if (cur == NULL) {
639
0
  htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
640
0
  return;
641
0
    }
642
0
    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
643
0
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
644
0
    if (cur->ExternalID != NULL) {
645
0
  xmlOutputBufferWriteString(buf, " PUBLIC ");
646
0
  xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
647
0
  if (cur->SystemID != NULL) {
648
0
      xmlOutputBufferWriteString(buf, " ");
649
0
      xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
650
0
  }
651
0
    } else if (cur->SystemID != NULL &&
652
0
         xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
653
0
  xmlOutputBufferWriteString(buf, " SYSTEM ");
654
0
  xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
655
0
    }
656
0
    xmlOutputBufferWriteString(buf, ">\n");
657
0
}
658
659
/**
660
 * htmlAttrDumpOutput:
661
 * @buf:  the HTML buffer output
662
 * @doc:  the document
663
 * @cur:  the attribute pointer
664
 *
665
 * Dump an HTML attribute
666
 */
667
static void
668
148k
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
669
148k
    xmlChar *value;
670
671
    /*
672
     * The html output method should not escape a & character
673
     * occurring in an attribute value immediately followed by
674
     * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
675
     * This is implemented in xmlEncodeEntitiesReentrant
676
     */
677
678
148k
    if (cur == NULL) {
679
0
  return;
680
0
    }
681
148k
    xmlOutputBufferWriteString(buf, " ");
682
148k
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
683
253
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
684
253
  xmlOutputBufferWriteString(buf, ":");
685
253
    }
686
148k
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
687
148k
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
688
148k
  value = xmlNodeListGetString(doc, cur->children, 0);
689
148k
  if (value) {
690
146k
      xmlOutputBufferWriteString(buf, "=");
691
146k
      if ((cur->ns == NULL) && (cur->parent != NULL) &&
692
146k
    (cur->parent->ns == NULL) &&
693
146k
    ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
694
146k
           (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
695
146k
     (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
696
146k
     ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
697
145k
      (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
698
903
    xmlChar *escaped;
699
903
    xmlChar *tmp = value;
700
701
903
    while (IS_BLANK_CH(*tmp)) tmp++;
702
703
    /*
704
                 * Angle brackets are technically illegal in URIs, but they're
705
                 * used in server side includes, for example. Curly brackets
706
                 * are illegal as well and often used in templates.
707
                 * Don't escape non-whitespace, printable ASCII chars for
708
                 * improved interoperability. Only escape space, control
709
                 * and non-ASCII chars.
710
     */
711
903
    escaped = xmlURIEscapeStr(tmp,
712
903
                        BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
713
903
    if (escaped != NULL) {
714
899
        xmlBufWriteQuotedString(buf->buffer, escaped);
715
899
        xmlFree(escaped);
716
899
    } else {
717
4
        xmlBufWriteQuotedString(buf->buffer, value);
718
4
    }
719
145k
      } else {
720
145k
    xmlBufWriteQuotedString(buf->buffer, value);
721
145k
      }
722
146k
      xmlFree(value);
723
146k
  } else  {
724
1.98k
      xmlOutputBufferWriteString(buf, "=\"\"");
725
1.98k
  }
726
148k
    }
727
148k
}
728
729
/**
730
 * htmlNodeDumpFormatOutput:
731
 * @buf:  the HTML buffer output
732
 * @doc:  the document
733
 * @cur:  the current node
734
 * @encoding:  the encoding string (unused)
735
 * @format:  should formatting spaces been added
736
 *
737
 * Dump an HTML node, recursive behaviour,children are printed too.
738
 */
739
void
740
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
741
                   xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
742
535
                         int format) {
743
535
    xmlNodePtr root, parent;
744
535
    xmlAttrPtr attr;
745
535
    const htmlElemDesc * info;
746
747
535
    xmlInitParser();
748
749
535
    if ((cur == NULL) || (buf == NULL)) {
750
0
  return;
751
0
    }
752
753
535
    root = cur;
754
535
    parent = cur->parent;
755
204k
    while (1) {
756
204k
        switch (cur->type) {
757
535
        case XML_HTML_DOCUMENT_NODE:
758
535
        case XML_DOCUMENT_NODE:
759
535
            if (((xmlDocPtr) cur)->intSubset != NULL) {
760
0
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
761
0
            }
762
535
            if (cur->children != NULL) {
763
                /* Always validate cur->parent when descending. */
764
535
                if (cur->parent == parent) {
765
535
                    parent = cur;
766
535
                    cur = cur->children;
767
535
                    continue;
768
535
                }
769
535
            } else {
770
0
                xmlOutputBufferWriteString(buf, "\n");
771
0
            }
772
0
            break;
773
774
180k
        case XML_ELEMENT_NODE:
775
            /*
776
             * Some users like lxml are known to pass nodes with a corrupted
777
             * tree structure. Fall back to a recursive call to handle this
778
             * case.
779
             */
780
180k
            if ((cur->parent != parent) && (cur->children != NULL)) {
781
0
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
782
0
                break;
783
0
            }
784
785
            /*
786
             * Get specific HTML info for that node.
787
             */
788
180k
            if (cur->ns == NULL)
789
179k
                info = htmlTagLookup(cur->name);
790
912
            else
791
912
                info = NULL;
792
793
180k
            xmlOutputBufferWriteString(buf, "<");
794
180k
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
795
114
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
796
114
                xmlOutputBufferWriteString(buf, ":");
797
114
            }
798
180k
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
799
180k
            if (cur->nsDef)
800
35.8k
                xmlNsListDumpOutput(buf, cur->nsDef);
801
180k
            attr = cur->properties;
802
329k
            while (attr != NULL) {
803
148k
                htmlAttrDumpOutput(buf, doc, attr);
804
148k
                attr = attr->next;
805
148k
            }
806
807
180k
            if ((info != NULL) && (info->empty)) {
808
19
                xmlOutputBufferWriteString(buf, ">");
809
180k
            } else if (cur->children == NULL) {
810
149k
                if ((info != NULL) && (info->saveEndTag != 0) &&
811
149k
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
812
149k
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
813
0
                    xmlOutputBufferWriteString(buf, ">");
814
149k
                } else {
815
149k
                    xmlOutputBufferWriteString(buf, "></");
816
149k
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
817
101
                        xmlOutputBufferWriteString(buf,
818
101
                                (const char *)cur->ns->prefix);
819
101
                        xmlOutputBufferWriteString(buf, ":");
820
101
                    }
821
149k
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
822
149k
                    xmlOutputBufferWriteString(buf, ">");
823
149k
                }
824
149k
            } else {
825
31.8k
                xmlOutputBufferWriteString(buf, ">");
826
31.8k
                if ((format) && (info != NULL) && (!info->isinline) &&
827
31.8k
                    (cur->children->type != HTML_TEXT_NODE) &&
828
31.8k
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
829
31.8k
                    (cur->children != cur->last) &&
830
31.8k
                    (cur->name != NULL) &&
831
31.8k
                    (cur->name[0] != 'p')) /* p, pre, param */
832
1.29k
                    xmlOutputBufferWriteString(buf, "\n");
833
31.8k
                parent = cur;
834
31.8k
                cur = cur->children;
835
31.8k
                continue;
836
31.8k
            }
837
838
149k
            if ((format) && (cur->next != NULL) &&
839
149k
                (info != NULL) && (!info->isinline)) {
840
2.52k
                if ((cur->next->type != HTML_TEXT_NODE) &&
841
2.52k
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
842
2.52k
                    (parent != NULL) &&
843
2.52k
                    (parent->name != NULL) &&
844
2.52k
                    (parent->name[0] != 'p')) /* p, pre, param */
845
1.25k
                    xmlOutputBufferWriteString(buf, "\n");
846
2.52k
            }
847
848
149k
            break;
849
850
0
        case XML_ATTRIBUTE_NODE:
851
0
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
852
0
            break;
853
854
19.9k
        case HTML_TEXT_NODE:
855
19.9k
            if (cur->content == NULL)
856
0
                break;
857
19.9k
            if (((cur->name == (const xmlChar *)xmlStringText) ||
858
19.9k
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
859
19.9k
                ((parent == NULL) ||
860
19.9k
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
861
19.9k
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
862
19.9k
                xmlChar *buffer;
863
864
19.9k
                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
865
19.9k
                if (buffer != NULL) {
866
18.1k
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
867
18.1k
                    xmlFree(buffer);
868
18.1k
                }
869
19.9k
            } else {
870
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
871
0
            }
872
19.9k
            break;
873
874
1.54k
        case HTML_COMMENT_NODE:
875
1.54k
            if (cur->content != NULL) {
876
1.53k
                xmlOutputBufferWriteString(buf, "<!--");
877
1.53k
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
878
1.53k
                xmlOutputBufferWriteString(buf, "-->");
879
1.53k
            }
880
1.54k
            break;
881
882
1.48k
        case HTML_PI_NODE:
883
1.48k
            if (cur->name != NULL) {
884
1.48k
                xmlOutputBufferWriteString(buf, "<?");
885
1.48k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
886
1.48k
                if (cur->content != NULL) {
887
1.48k
                    xmlOutputBufferWriteString(buf, " ");
888
1.48k
                    xmlOutputBufferWriteString(buf,
889
1.48k
                            (const char *)cur->content);
890
1.48k
                }
891
1.48k
                xmlOutputBufferWriteString(buf, ">");
892
1.48k
            }
893
1.48k
            break;
894
895
0
        case HTML_ENTITY_REF_NODE:
896
0
            xmlOutputBufferWriteString(buf, "&");
897
0
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
898
0
            xmlOutputBufferWriteString(buf, ";");
899
0
            break;
900
901
0
        case HTML_PRESERVE_NODE:
902
0
            if (cur->content != NULL) {
903
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
904
0
            }
905
0
            break;
906
907
0
        default:
908
0
            break;
909
204k
        }
910
911
204k
        while (1) {
912
204k
            if (cur == root)
913
535
                return;
914
203k
            if (cur->next != NULL) {
915
171k
                cur = cur->next;
916
171k
                break;
917
171k
            }
918
919
32.3k
            cur = parent;
920
            /* cur->parent was validated when descending. */
921
32.3k
            parent = cur->parent;
922
923
32.3k
            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
924
32.3k
                (cur->type == XML_DOCUMENT_NODE)) {
925
535
                xmlOutputBufferWriteString(buf, "\n");
926
31.8k
            } else {
927
31.8k
                if ((format) && (cur->ns == NULL))
928
31.4k
                    info = htmlTagLookup(cur->name);
929
404
                else
930
404
                    info = NULL;
931
932
31.8k
                if ((format) && (info != NULL) && (!info->isinline) &&
933
31.8k
                    (cur->last->type != HTML_TEXT_NODE) &&
934
31.8k
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
935
31.8k
                    (cur->children != cur->last) &&
936
31.8k
                    (cur->name != NULL) &&
937
31.8k
                    (cur->name[0] != 'p')) /* p, pre, param */
938
1.49k
                    xmlOutputBufferWriteString(buf, "\n");
939
940
31.8k
                xmlOutputBufferWriteString(buf, "</");
941
31.8k
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
942
13
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
943
13
                    xmlOutputBufferWriteString(buf, ":");
944
13
                }
945
31.8k
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
946
31.8k
                xmlOutputBufferWriteString(buf, ">");
947
948
31.8k
                if ((format) && (info != NULL) && (!info->isinline) &&
949
31.8k
                    (cur->next != NULL)) {
950
3.24k
                    if ((cur->next->type != HTML_TEXT_NODE) &&
951
3.24k
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
952
3.24k
                        (parent != NULL) &&
953
3.24k
                        (parent->name != NULL) &&
954
3.24k
                        (parent->name[0] != 'p')) /* p, pre, param */
955
1.84k
                        xmlOutputBufferWriteString(buf, "\n");
956
3.24k
                }
957
31.8k
            }
958
32.3k
        }
959
172k
    }
960
535
}
961
962
/**
963
 * htmlNodeDumpOutput:
964
 * @buf:  the HTML buffer output
965
 * @doc:  the document
966
 * @cur:  the current node
967
 * @encoding:  the encoding string (unused)
968
 *
969
 * Dump an HTML node, recursive behaviour,children are printed too,
970
 * and formatting returns/spaces are added.
971
 */
972
void
973
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
974
0
             xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
975
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
976
0
}
977
978
/**
979
 * htmlDocContentDumpFormatOutput:
980
 * @buf:  the HTML buffer output
981
 * @cur:  the document
982
 * @encoding:  the encoding string (unused)
983
 * @format:  should formatting spaces been added
984
 *
985
 * Dump an HTML document.
986
 */
987
void
988
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
989
                         const char *encoding ATTRIBUTE_UNUSED,
990
535
                               int format) {
991
535
    int type = 0;
992
535
    if (cur) {
993
535
        type = cur->type;
994
535
        cur->type = XML_HTML_DOCUMENT_NODE;
995
535
    }
996
535
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
997
535
    if (cur)
998
535
        cur->type = (xmlElementType) type;
999
535
}
1000
1001
/**
1002
 * htmlDocContentDumpOutput:
1003
 * @buf:  the HTML buffer output
1004
 * @cur:  the document
1005
 * @encoding:  the encoding string (unused)
1006
 *
1007
 * Dump an HTML document. Formatting return/spaces are added.
1008
 */
1009
void
1010
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1011
0
                   const char *encoding ATTRIBUTE_UNUSED) {
1012
0
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1013
0
}
1014
1015
/************************************************************************
1016
 *                  *
1017
 *    Saving functions front-ends       *
1018
 *                  *
1019
 ************************************************************************/
1020
1021
/**
1022
 * htmlDocDump:
1023
 * @f:  the FILE*
1024
 * @cur:  the document
1025
 *
1026
 * Dump an HTML document to an open FILE.
1027
 *
1028
 * returns: the number of byte written or -1 in case of failure.
1029
 */
1030
int
1031
0
htmlDocDump(FILE *f, xmlDocPtr cur) {
1032
0
    xmlOutputBufferPtr buf;
1033
0
    xmlCharEncodingHandlerPtr handler = NULL;
1034
0
    const char *encoding;
1035
0
    int ret;
1036
1037
0
    xmlInitParser();
1038
1039
0
    if ((cur == NULL) || (f == NULL)) {
1040
0
  return(-1);
1041
0
    }
1042
1043
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1044
1045
0
    if (encoding != NULL) {
1046
0
  xmlCharEncoding enc;
1047
1048
0
  enc = xmlParseCharEncoding(encoding);
1049
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1050
0
      handler = xmlFindCharEncodingHandler(encoding);
1051
0
      if (handler == NULL)
1052
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1053
0
  }
1054
0
    } else {
1055
        /*
1056
         * Fallback to HTML or ASCII when the encoding is unspecified
1057
         */
1058
0
        if (handler == NULL)
1059
0
            handler = xmlFindCharEncodingHandler("HTML");
1060
0
        if (handler == NULL)
1061
0
            handler = xmlFindCharEncodingHandler("ascii");
1062
0
    }
1063
1064
0
    buf = xmlOutputBufferCreateFile(f, handler);
1065
0
    if (buf == NULL) return(-1);
1066
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1067
1068
0
    ret = xmlOutputBufferClose(buf);
1069
0
    return(ret);
1070
0
}
1071
1072
/**
1073
 * htmlSaveFile:
1074
 * @filename:  the filename (or URL)
1075
 * @cur:  the document
1076
 *
1077
 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1078
 * used.
1079
 * returns: the number of byte written or -1 in case of failure.
1080
 */
1081
int
1082
0
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1083
0
    xmlOutputBufferPtr buf;
1084
0
    xmlCharEncodingHandlerPtr handler = NULL;
1085
0
    const char *encoding;
1086
0
    int ret;
1087
1088
0
    if ((cur == NULL) || (filename == NULL))
1089
0
        return(-1);
1090
1091
0
    xmlInitParser();
1092
1093
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1094
1095
0
    if (encoding != NULL) {
1096
0
  xmlCharEncoding enc;
1097
1098
0
  enc = xmlParseCharEncoding(encoding);
1099
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1100
0
      handler = xmlFindCharEncodingHandler(encoding);
1101
0
      if (handler == NULL)
1102
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1103
0
  }
1104
0
    } else {
1105
        /*
1106
         * Fallback to HTML or ASCII when the encoding is unspecified
1107
         */
1108
0
        if (handler == NULL)
1109
0
            handler = xmlFindCharEncodingHandler("HTML");
1110
0
        if (handler == NULL)
1111
0
            handler = xmlFindCharEncodingHandler("ascii");
1112
0
    }
1113
1114
    /*
1115
     * save the content to a temp buffer.
1116
     */
1117
0
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1118
0
    if (buf == NULL) return(0);
1119
1120
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1121
1122
0
    ret = xmlOutputBufferClose(buf);
1123
0
    return(ret);
1124
0
}
1125
1126
/**
1127
 * htmlSaveFileFormat:
1128
 * @filename:  the filename
1129
 * @cur:  the document
1130
 * @format:  should formatting spaces been added
1131
 * @encoding: the document encoding
1132
 *
1133
 * Dump an HTML document to a file using a given encoding.
1134
 *
1135
 * returns: the number of byte written or -1 in case of failure.
1136
 */
1137
int
1138
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1139
0
             const char *encoding, int format) {
1140
0
    xmlOutputBufferPtr buf;
1141
0
    xmlCharEncodingHandlerPtr handler = NULL;
1142
0
    int ret;
1143
1144
0
    if ((cur == NULL) || (filename == NULL))
1145
0
        return(-1);
1146
1147
0
    xmlInitParser();
1148
1149
0
    if (encoding != NULL) {
1150
0
  xmlCharEncoding enc;
1151
1152
0
  enc = xmlParseCharEncoding(encoding);
1153
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1154
0
      handler = xmlFindCharEncodingHandler(encoding);
1155
0
      if (handler == NULL)
1156
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1157
0
  }
1158
0
        htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1159
0
    } else {
1160
0
  htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1161
1162
        /*
1163
         * Fallback to HTML or ASCII when the encoding is unspecified
1164
         */
1165
0
        if (handler == NULL)
1166
0
            handler = xmlFindCharEncodingHandler("HTML");
1167
0
        if (handler == NULL)
1168
0
            handler = xmlFindCharEncodingHandler("ascii");
1169
0
    }
1170
1171
    /*
1172
     * save the content to a temp buffer.
1173
     */
1174
0
    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1175
0
    if (buf == NULL) return(0);
1176
1177
0
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1178
1179
0
    ret = xmlOutputBufferClose(buf);
1180
0
    return(ret);
1181
0
}
1182
1183
/**
1184
 * htmlSaveFileEnc:
1185
 * @filename:  the filename
1186
 * @cur:  the document
1187
 * @encoding: the document encoding
1188
 *
1189
 * Dump an HTML document to a file using a given encoding
1190
 * and formatting returns/spaces are added.
1191
 *
1192
 * returns: the number of byte written or -1 in case of failure.
1193
 */
1194
int
1195
0
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1196
0
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1197
0
}
1198
1199
#endif /* LIBXML_OUTPUT_ENABLED */
1200
1201
#endif /* LIBXML_HTML_ENABLED */