Coverage Report

Created: 2025-07-23 08:13

/src/fontconfig/subprojects/libxml2-2.12.6/HTMLtree.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
#include <ctype.h>
16
#include <stdlib.h>
17
18
#include <libxml/xmlmemory.h>
19
#include <libxml/HTMLparser.h>
20
#include <libxml/HTMLtree.h>
21
#include <libxml/entities.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/parserInternals.h>
24
#include <libxml/uri.h>
25
26
#include "private/buf.h"
27
#include "private/error.h"
28
#include "private/io.h"
29
#include "private/save.h"
30
31
/************************************************************************
32
 *                  *
33
 *    Getting/Setting encoding meta tags      *
34
 *                  *
35
 ************************************************************************/
36
37
/**
38
 * htmlGetMetaEncoding:
39
 * @doc:  the document
40
 *
41
 * Encoding definition lookup in the Meta tags
42
 *
43
 * Returns the current encoding as flagged in the HTML source
44
 */
45
const xmlChar *
46
0
htmlGetMetaEncoding(htmlDocPtr doc) {
47
0
    htmlNodePtr cur;
48
0
    const xmlChar *content;
49
0
    const xmlChar *encoding;
50
51
0
    if (doc == NULL)
52
0
  return(NULL);
53
0
    cur = doc->children;
54
55
    /*
56
     * Search the html
57
     */
58
0
    while (cur != NULL) {
59
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60
0
      if (xmlStrEqual(cur->name, BAD_CAST"html"))
61
0
    break;
62
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
63
0
    goto found_head;
64
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65
0
    goto found_meta;
66
0
  }
67
0
  cur = cur->next;
68
0
    }
69
0
    if (cur == NULL)
70
0
  return(NULL);
71
0
    cur = cur->children;
72
73
    /*
74
     * Search the head
75
     */
76
0
    while (cur != NULL) {
77
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78
0
      if (xmlStrEqual(cur->name, BAD_CAST"head"))
79
0
    break;
80
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81
0
    goto found_meta;
82
0
  }
83
0
  cur = cur->next;
84
0
    }
85
0
    if (cur == NULL)
86
0
  return(NULL);
87
0
found_head:
88
0
    cur = cur->children;
89
90
    /*
91
     * Search the meta elements
92
     */
93
0
found_meta:
94
0
    while (cur != NULL) {
95
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96
0
      if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97
0
    xmlAttrPtr attr = cur->properties;
98
0
    int http;
99
0
    const xmlChar *value;
100
101
0
    content = NULL;
102
0
    http = 0;
103
0
    while (attr != NULL) {
104
0
        if ((attr->children != NULL) &&
105
0
            (attr->children->type == XML_TEXT_NODE) &&
106
0
            (attr->children->next == NULL)) {
107
0
      value = attr->children->content;
108
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110
0
          http = 1;
111
0
      else if ((value != NULL)
112
0
       && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113
0
          content = value;
114
0
      if ((http != 0) && (content != NULL))
115
0
          goto found_content;
116
0
        }
117
0
        attr = attr->next;
118
0
    }
119
0
      }
120
0
  }
121
0
  cur = cur->next;
122
0
    }
123
0
    return(NULL);
124
125
0
found_content:
126
0
    encoding = xmlStrstr(content, BAD_CAST"charset=");
127
0
    if (encoding == NULL)
128
0
  encoding = xmlStrstr(content, BAD_CAST"Charset=");
129
0
    if (encoding == NULL)
130
0
  encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131
0
    if (encoding != NULL) {
132
0
  encoding += 8;
133
0
    } else {
134
0
  encoding = xmlStrstr(content, BAD_CAST"charset =");
135
0
  if (encoding == NULL)
136
0
      encoding = xmlStrstr(content, BAD_CAST"Charset =");
137
0
  if (encoding == NULL)
138
0
      encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139
0
  if (encoding != NULL)
140
0
      encoding += 9;
141
0
    }
142
0
    if (encoding != NULL) {
143
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144
0
    }
145
0
    return(encoding);
146
0
}
147
148
/**
149
 * htmlSetMetaEncoding:
150
 * @doc:  the document
151
 * @encoding:  the encoding string
152
 *
153
 * Sets the current encoding in the Meta tags
154
 * NOTE: this will not change the document content encoding, just
155
 * the META flag associated.
156
 *
157
 * Returns 0 in case of success and -1 in case of error
158
 */
159
int
160
0
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161
0
    htmlNodePtr cur, meta = NULL, head = NULL;
162
0
    const xmlChar *content = NULL;
163
0
    char newcontent[100];
164
165
0
    newcontent[0] = 0;
166
167
0
    if (doc == NULL)
168
0
  return(-1);
169
170
    /* html isn't a real encoding it's just libxml2 way to get entities */
171
0
    if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172
0
        return(-1);
173
174
0
    if (encoding != NULL) {
175
0
  snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176
0
                (char *)encoding);
177
0
  newcontent[sizeof(newcontent) - 1] = 0;
178
0
    }
179
180
0
    cur = doc->children;
181
182
    /*
183
     * Search the html
184
     */
185
0
    while (cur != NULL) {
186
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188
0
    break;
189
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190
0
    goto found_head;
191
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192
0
    goto found_meta;
193
0
  }
194
0
  cur = cur->next;
195
0
    }
196
0
    if (cur == NULL)
197
0
  return(-1);
198
0
    cur = cur->children;
199
200
    /*
201
     * Search the head
202
     */
203
0
    while (cur != NULL) {
204
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206
0
    break;
207
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208
0
                head = cur->parent;
209
0
    goto found_meta;
210
0
            }
211
0
  }
212
0
  cur = cur->next;
213
0
    }
214
0
    if (cur == NULL)
215
0
  return(-1);
216
0
found_head:
217
0
    head = cur;
218
0
    if (cur->children == NULL)
219
0
        goto create;
220
0
    cur = cur->children;
221
222
0
found_meta:
223
    /*
224
     * Search and update all the remaining the meta elements carrying
225
     * encoding information
226
     */
227
0
    while (cur != NULL) {
228
0
  if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229
0
      if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230
0
    xmlAttrPtr attr = cur->properties;
231
0
    int http;
232
0
    const xmlChar *value;
233
234
0
    content = NULL;
235
0
    http = 0;
236
0
    while (attr != NULL) {
237
0
        if ((attr->children != NULL) &&
238
0
            (attr->children->type == XML_TEXT_NODE) &&
239
0
            (attr->children->next == NULL)) {
240
0
      value = attr->children->content;
241
0
      if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242
0
       && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243
0
          http = 1;
244
0
      else
245
0
                        {
246
0
                           if ((value != NULL) &&
247
0
                               (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248
0
             content = value;
249
0
                        }
250
0
            if ((http != 0) && (content != NULL))
251
0
          break;
252
0
        }
253
0
        attr = attr->next;
254
0
    }
255
0
    if ((http != 0) && (content != NULL)) {
256
0
        meta = cur;
257
0
        break;
258
0
    }
259
260
0
      }
261
0
  }
262
0
  cur = cur->next;
263
0
    }
264
0
create:
265
0
    if (meta == NULL) {
266
0
        if ((encoding != NULL) && (head != NULL)) {
267
            /*
268
             * Create a new Meta element with the right attributes
269
             */
270
271
0
            meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272
0
            if (head->children == NULL)
273
0
                xmlAddChild(head, meta);
274
0
            else
275
0
                xmlAddPrevSibling(head->children, meta);
276
0
            xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277
0
            xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278
0
        }
279
0
    } else {
280
        /* remove the meta tag if NULL is passed */
281
0
        if (encoding == NULL) {
282
0
            xmlUnlinkNode(meta);
283
0
            xmlFreeNode(meta);
284
0
        }
285
        /* change the document only if there is a real encoding change */
286
0
        else if (xmlStrcasestr(content, encoding) == NULL) {
287
0
            xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288
0
        }
289
0
    }
290
291
292
0
    return(0);
293
0
}
294
295
/**
 * htmlBooleanAttrs:
 *
 * These are the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 * The list ends with a NULL sentinel.
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308
309
310
/**
311
 * htmlIsBooleanAttr:
312
 * @name:  the name of the attribute to check
313
 *
314
 * Determine if a given attribute is a boolean attribute.
315
 *
316
 * returns: false if the attribute is not boolean, true otherwise.
317
 */
318
int
319
htmlIsBooleanAttr(const xmlChar *name)
320
0
{
321
0
    int i = 0;
322
323
0
    while (htmlBooleanAttrs[i] != NULL) {
324
0
        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325
0
            return 1;
326
0
        i++;
327
0
    }
328
0
    return 0;
329
0
}
330
331
#ifdef LIBXML_OUTPUT_ENABLED
332
/************************************************************************
333
 *                  *
334
 *      Output error handlers       *
335
 *                  *
336
 ************************************************************************/
337
/**
338
 * htmlSaveErrMemory:
339
 * @extra:  extra information
340
 *
341
 * Handle an out of memory condition
342
 */
343
static void
344
htmlSaveErrMemory(const char *extra)
345
0
{
346
0
    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
347
0
}
348
349
/**
350
 * htmlSaveErr:
351
 * @code:  the error number
352
 * @node:  the location of the error.
353
 * @extra:  extra information
354
 *
355
 * Handle an out of memory condition
356
 */
357
static void
358
htmlSaveErr(int code, xmlNodePtr node, const char *extra)
359
0
{
360
0
    const char *msg = NULL;
361
362
0
    switch(code) {
363
0
        case XML_SAVE_NOT_UTF8:
364
0
      msg = "string is not in UTF-8\n";
365
0
      break;
366
0
  case XML_SAVE_CHAR_INVALID:
367
0
      msg = "invalid character value\n";
368
0
      break;
369
0
  case XML_SAVE_UNKNOWN_ENCODING:
370
0
      msg = "unknown encoding %s\n";
371
0
      break;
372
0
  case XML_SAVE_NO_DOCTYPE:
373
0
      msg = "HTML has no DOCTYPE\n";
374
0
      break;
375
0
  default:
376
0
      msg = "unexpected error number\n";
377
0
    }
378
0
    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
379
0
}
380
381
/************************************************************************
382
 *                  *
383
 *    Dumping HTML tree content to a simple buffer    *
384
 *                  *
385
 ************************************************************************/
386
387
/**
388
 * htmlBufNodeDumpFormat:
389
 * @buf:  the xmlBufPtr output
390
 * @doc:  the document
391
 * @cur:  the current node
392
 * @format:  should formatting spaces been added
393
 *
394
 * Dump an HTML node, recursive behaviour,children are printed too.
395
 *
396
 * Returns the number of byte written or -1 in case of error
397
 */
398
static size_t
399
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
400
0
             int format) {
401
0
    size_t use;
402
0
    int ret;
403
0
    xmlOutputBufferPtr outbuf;
404
405
0
    if (cur == NULL) {
406
0
  return (-1);
407
0
    }
408
0
    if (buf == NULL) {
409
0
  return (-1);
410
0
    }
411
0
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
412
0
    if (outbuf == NULL) {
413
0
        htmlSaveErrMemory("allocating HTML output buffer");
414
0
  return (-1);
415
0
    }
416
0
    memset(outbuf, 0, sizeof(xmlOutputBuffer));
417
0
    outbuf->buffer = buf;
418
0
    outbuf->encoder = NULL;
419
0
    outbuf->writecallback = NULL;
420
0
    outbuf->closecallback = NULL;
421
0
    outbuf->context = NULL;
422
0
    outbuf->written = 0;
423
424
0
    use = xmlBufUse(buf);
425
0
    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
426
0
    xmlFree(outbuf);
427
0
    ret = xmlBufUse(buf) - use;
428
0
    return (ret);
429
0
}
430
431
/**
432
 * htmlNodeDump:
433
 * @buf:  the HTML buffer output
434
 * @doc:  the document
435
 * @cur:  the current node
436
 *
437
 * Dump an HTML node, recursive behaviour,children are printed too,
438
 * and formatting returns are added.
439
 *
440
 * Returns the number of byte written or -1 in case of error
441
 */
442
int
443
0
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
444
0
    xmlBufPtr buffer;
445
0
    size_t ret;
446
447
0
    if ((buf == NULL) || (cur == NULL))
448
0
        return(-1);
449
450
0
    xmlInitParser();
451
0
    buffer = xmlBufFromBuffer(buf);
452
0
    if (buffer == NULL)
453
0
        return(-1);
454
455
0
    ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
456
457
0
    xmlBufBackToBuffer(buffer);
458
459
0
    if (ret > INT_MAX)
460
0
        return(-1);
461
0
    return((int) ret);
462
0
}
463
464
/**
465
 * htmlNodeDumpFileFormat:
466
 * @out:  the FILE pointer
467
 * @doc:  the document
468
 * @cur:  the current node
469
 * @encoding: the document encoding
470
 * @format:  should formatting spaces been added
471
 *
472
 * Dump an HTML node, recursive behaviour,children are printed too.
473
 *
474
 * TODO: if encoding == NULL try to save in the doc encoding
475
 *
476
 * returns: the number of byte written or -1 in case of failure.
477
 */
478
int
479
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
480
0
                 xmlNodePtr cur, const char *encoding, int format) {
481
0
    xmlOutputBufferPtr buf;
482
0
    xmlCharEncodingHandlerPtr handler = NULL;
483
0
    int ret;
484
485
0
    xmlInitParser();
486
487
0
    if (encoding != NULL) {
488
0
  xmlCharEncoding enc;
489
490
0
  enc = xmlParseCharEncoding(encoding);
491
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
492
0
      handler = xmlFindCharEncodingHandler(encoding);
493
0
      if (handler == NULL)
494
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
495
0
  }
496
0
    } else {
497
        /*
498
         * Fallback to HTML or ASCII when the encoding is unspecified
499
         */
500
0
        if (handler == NULL)
501
0
            handler = xmlFindCharEncodingHandler("HTML");
502
0
        if (handler == NULL)
503
0
            handler = xmlFindCharEncodingHandler("ascii");
504
0
    }
505
506
    /*
507
     * save the content to a temp buffer.
508
     */
509
0
    buf = xmlOutputBufferCreateFile(out, handler);
510
0
    if (buf == NULL) return(0);
511
512
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
513
514
0
    ret = xmlOutputBufferClose(buf);
515
0
    return(ret);
516
0
}
517
518
/**
519
 * htmlNodeDumpFile:
520
 * @out:  the FILE pointer
521
 * @doc:  the document
522
 * @cur:  the current node
523
 *
524
 * Dump an HTML node, recursive behaviour,children are printed too,
525
 * and formatting returns are added.
526
 */
527
void
528
0
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
529
0
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
530
0
}
531
532
/**
533
 * htmlDocDumpMemoryFormat:
534
 * @cur:  the document
535
 * @mem:  OUT: the memory pointer
536
 * @size:  OUT: the memory length
537
 * @format:  should formatting spaces been added
538
 *
539
 * Dump an HTML document in memory and return the xmlChar * and it's size.
540
 * It's up to the caller to free the memory.
541
 */
542
void
543
0
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
544
0
    xmlOutputBufferPtr buf;
545
0
    xmlCharEncodingHandlerPtr handler = NULL;
546
0
    const char *encoding;
547
548
0
    xmlInitParser();
549
550
0
    if ((mem == NULL) || (size == NULL))
551
0
        return;
552
0
    if (cur == NULL) {
553
0
  *mem = NULL;
554
0
  *size = 0;
555
0
  return;
556
0
    }
557
558
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
559
560
0
    if (encoding != NULL) {
561
0
  xmlCharEncoding enc;
562
563
0
  enc = xmlParseCharEncoding(encoding);
564
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
565
0
      handler = xmlFindCharEncodingHandler(encoding);
566
0
      if (handler == NULL)
567
0
                htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
568
569
0
  }
570
0
    } else {
571
        /*
572
         * Fallback to HTML or ASCII when the encoding is unspecified
573
         */
574
0
        if (handler == NULL)
575
0
            handler = xmlFindCharEncodingHandler("HTML");
576
0
        if (handler == NULL)
577
0
            handler = xmlFindCharEncodingHandler("ascii");
578
0
    }
579
580
0
    buf = xmlAllocOutputBufferInternal(handler);
581
0
    if (buf == NULL) {
582
0
  *mem = NULL;
583
0
  *size = 0;
584
0
  return;
585
0
    }
586
587
0
    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
588
589
0
    xmlOutputBufferFlush(buf);
590
0
    if (buf->conv != NULL) {
591
0
  *size = xmlBufUse(buf->conv);
592
0
  *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
593
0
    } else {
594
0
  *size = xmlBufUse(buf->buffer);
595
0
  *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
596
0
    }
597
0
    (void)xmlOutputBufferClose(buf);
598
0
}
599
600
/**
601
 * htmlDocDumpMemory:
602
 * @cur:  the document
603
 * @mem:  OUT: the memory pointer
604
 * @size:  OUT: the memory length
605
 *
606
 * Dump an HTML document in memory and return the xmlChar * and it's size.
607
 * It's up to the caller to free the memory.
608
 */
609
void
610
0
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
611
0
  htmlDocDumpMemoryFormat(cur, mem, size, 1);
612
0
}
613
614
615
/************************************************************************
616
 *                  *
617
 *    Dumping HTML tree content to an I/O output buffer *
618
 *                  *
619
 ************************************************************************/
620
621
/**
622
 * htmlDtdDumpOutput:
623
 * @buf:  the HTML buffer output
624
 * @doc:  the document
625
 * @encoding:  the encoding string
626
 *
627
 * TODO: check whether encoding is needed
628
 *
629
 * Dump the HTML document DTD, if any.
630
 */
631
static void
632
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
633
0
            const char *encoding ATTRIBUTE_UNUSED) {
634
0
    xmlDtdPtr cur = doc->intSubset;
635
636
0
    if (cur == NULL) {
637
0
  htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
638
0
  return;
639
0
    }
640
0
    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
641
0
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
642
0
    if (cur->ExternalID != NULL) {
643
0
  xmlOutputBufferWriteString(buf, " PUBLIC ");
644
0
  xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
645
0
  if (cur->SystemID != NULL) {
646
0
      xmlOutputBufferWriteString(buf, " ");
647
0
      xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
648
0
  }
649
0
    } else if (cur->SystemID != NULL &&
650
0
         xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
651
0
  xmlOutputBufferWriteString(buf, " SYSTEM ");
652
0
  xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
653
0
    }
654
0
    xmlOutputBufferWriteString(buf, ">\n");
655
0
}
656
657
/**
658
 * htmlAttrDumpOutput:
659
 * @buf:  the HTML buffer output
660
 * @doc:  the document
661
 * @cur:  the attribute pointer
662
 *
663
 * Dump an HTML attribute
664
 */
665
static void
666
0
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
667
0
    xmlChar *value;
668
669
    /*
670
     * The html output method should not escape a & character
671
     * occurring in an attribute value immediately followed by
672
     * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
673
     * This is implemented in xmlEncodeEntitiesReentrant
674
     */
675
676
0
    if (cur == NULL) {
677
0
  return;
678
0
    }
679
0
    xmlOutputBufferWriteString(buf, " ");
680
0
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
681
0
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
682
0
  xmlOutputBufferWriteString(buf, ":");
683
0
    }
684
0
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
685
0
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
686
0
  value = xmlNodeListGetString(doc, cur->children, 0);
687
0
  if (value) {
688
0
      xmlOutputBufferWriteString(buf, "=");
689
0
      if ((cur->ns == NULL) && (cur->parent != NULL) &&
690
0
    (cur->parent->ns == NULL) &&
691
0
    ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
692
0
           (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
693
0
     (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
694
0
     ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
695
0
      (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
696
0
    xmlChar *escaped;
697
0
    xmlChar *tmp = value;
698
699
0
    while (IS_BLANK_CH(*tmp)) tmp++;
700
701
    /*
702
                 * Angle brackets are technically illegal in URIs, but they're
703
                 * used in server side includes, for example. Curly brackets
704
                 * are illegal as well and often used in templates.
705
                 * Don't escape non-whitespace, printable ASCII chars for
706
                 * improved interoperability. Only escape space, control
707
                 * and non-ASCII chars.
708
     */
709
0
    escaped = xmlURIEscapeStr(tmp,
710
0
                        BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
711
0
    if (escaped != NULL) {
712
0
        xmlBufWriteQuotedString(buf->buffer, escaped);
713
0
        xmlFree(escaped);
714
0
    } else {
715
0
        xmlBufWriteQuotedString(buf->buffer, value);
716
0
    }
717
0
      } else {
718
0
    xmlBufWriteQuotedString(buf->buffer, value);
719
0
      }
720
0
      xmlFree(value);
721
0
  } else  {
722
0
      xmlOutputBufferWriteString(buf, "=\"\"");
723
0
  }
724
0
    }
725
0
}
726
727
/**
728
 * htmlNodeDumpFormatOutput:
729
 * @buf:  the HTML buffer output
730
 * @doc:  the document
731
 * @cur:  the current node
732
 * @encoding:  the encoding string (unused)
733
 * @format:  should formatting spaces been added
734
 *
735
 * Dump an HTML node, recursive behaviour,children are printed too.
736
 */
737
void
738
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739
                   xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
740
0
                         int format) {
741
0
    xmlNodePtr root, parent;
742
0
    xmlAttrPtr attr;
743
0
    const htmlElemDesc * info;
744
745
0
    xmlInitParser();
746
747
0
    if ((cur == NULL) || (buf == NULL)) {
748
0
  return;
749
0
    }
750
751
0
    root = cur;
752
0
    parent = cur->parent;
753
0
    while (1) {
754
0
        switch (cur->type) {
755
0
        case XML_HTML_DOCUMENT_NODE:
756
0
        case XML_DOCUMENT_NODE:
757
0
            if (((xmlDocPtr) cur)->intSubset != NULL) {
758
0
                htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
759
0
            }
760
0
            if (cur->children != NULL) {
761
                /* Always validate cur->parent when descending. */
762
0
                if (cur->parent == parent) {
763
0
                    parent = cur;
764
0
                    cur = cur->children;
765
0
                    continue;
766
0
                }
767
0
            } else {
768
0
                xmlOutputBufferWriteString(buf, "\n");
769
0
            }
770
0
            break;
771
772
0
        case XML_ELEMENT_NODE:
773
            /*
774
             * Some users like lxml are known to pass nodes with a corrupted
775
             * tree structure. Fall back to a recursive call to handle this
776
             * case.
777
             */
778
0
            if ((cur->parent != parent) && (cur->children != NULL)) {
779
0
                htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
780
0
                break;
781
0
            }
782
783
            /*
784
             * Get specific HTML info for that node.
785
             */
786
0
            if (cur->ns == NULL)
787
0
                info = htmlTagLookup(cur->name);
788
0
            else
789
0
                info = NULL;
790
791
0
            xmlOutputBufferWriteString(buf, "<");
792
0
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
793
0
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
794
0
                xmlOutputBufferWriteString(buf, ":");
795
0
            }
796
0
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
797
0
            if (cur->nsDef)
798
0
                xmlNsListDumpOutput(buf, cur->nsDef);
799
0
            attr = cur->properties;
800
0
            while (attr != NULL) {
801
0
                htmlAttrDumpOutput(buf, doc, attr);
802
0
                attr = attr->next;
803
0
            }
804
805
0
            if ((info != NULL) && (info->empty)) {
806
0
                xmlOutputBufferWriteString(buf, ">");
807
0
            } else if (cur->children == NULL) {
808
0
                if ((info != NULL) && (info->saveEndTag != 0) &&
809
0
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
810
0
                    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
811
0
                    xmlOutputBufferWriteString(buf, ">");
812
0
                } else {
813
0
                    xmlOutputBufferWriteString(buf, "></");
814
0
                    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
815
0
                        xmlOutputBufferWriteString(buf,
816
0
                                (const char *)cur->ns->prefix);
817
0
                        xmlOutputBufferWriteString(buf, ":");
818
0
                    }
819
0
                    xmlOutputBufferWriteString(buf, (const char *)cur->name);
820
0
                    xmlOutputBufferWriteString(buf, ">");
821
0
                }
822
0
            } else {
823
0
                xmlOutputBufferWriteString(buf, ">");
824
0
                if ((format) && (info != NULL) && (!info->isinline) &&
825
0
                    (cur->children->type != HTML_TEXT_NODE) &&
826
0
                    (cur->children->type != HTML_ENTITY_REF_NODE) &&
827
0
                    (cur->children != cur->last) &&
828
0
                    (cur->name != NULL) &&
829
0
                    (cur->name[0] != 'p')) /* p, pre, param */
830
0
                    xmlOutputBufferWriteString(buf, "\n");
831
0
                parent = cur;
832
0
                cur = cur->children;
833
0
                continue;
834
0
            }
835
836
0
            if ((format) && (cur->next != NULL) &&
837
0
                (info != NULL) && (!info->isinline)) {
838
0
                if ((cur->next->type != HTML_TEXT_NODE) &&
839
0
                    (cur->next->type != HTML_ENTITY_REF_NODE) &&
840
0
                    (parent != NULL) &&
841
0
                    (parent->name != NULL) &&
842
0
                    (parent->name[0] != 'p')) /* p, pre, param */
843
0
                    xmlOutputBufferWriteString(buf, "\n");
844
0
            }
845
846
0
            break;
847
848
0
        case XML_ATTRIBUTE_NODE:
849
0
            htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
850
0
            break;
851
852
0
        case HTML_TEXT_NODE:
853
0
            if (cur->content == NULL)
854
0
                break;
855
0
            if (((cur->name == (const xmlChar *)xmlStringText) ||
856
0
                 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
857
0
                ((parent == NULL) ||
858
0
                 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
859
0
                  (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
860
0
                xmlChar *buffer;
861
862
0
                buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863
0
                if (buffer != NULL) {
864
0
                    xmlOutputBufferWriteString(buf, (const char *)buffer);
865
0
                    xmlFree(buffer);
866
0
                }
867
0
            } else {
868
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
869
0
            }
870
0
            break;
871
872
0
        case HTML_COMMENT_NODE:
873
0
            if (cur->content != NULL) {
874
0
                xmlOutputBufferWriteString(buf, "<!--");
875
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
876
0
                xmlOutputBufferWriteString(buf, "-->");
877
0
            }
878
0
            break;
879
880
0
        case HTML_PI_NODE:
881
0
            if (cur->name != NULL) {
882
0
                xmlOutputBufferWriteString(buf, "<?");
883
0
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
884
0
                if (cur->content != NULL) {
885
0
                    xmlOutputBufferWriteString(buf, " ");
886
0
                    xmlOutputBufferWriteString(buf,
887
0
                            (const char *)cur->content);
888
0
                }
889
0
                xmlOutputBufferWriteString(buf, ">");
890
0
            }
891
0
            break;
892
893
0
        case HTML_ENTITY_REF_NODE:
894
0
            xmlOutputBufferWriteString(buf, "&");
895
0
            xmlOutputBufferWriteString(buf, (const char *)cur->name);
896
0
            xmlOutputBufferWriteString(buf, ";");
897
0
            break;
898
899
0
        case HTML_PRESERVE_NODE:
900
0
            if (cur->content != NULL) {
901
0
                xmlOutputBufferWriteString(buf, (const char *)cur->content);
902
0
            }
903
0
            break;
904
905
0
        default:
906
0
            break;
907
0
        }
908
909
0
        while (1) {
910
0
            if (cur == root)
911
0
                return;
912
0
            if (cur->next != NULL) {
913
0
                cur = cur->next;
914
0
                break;
915
0
            }
916
917
0
            cur = parent;
918
            /* cur->parent was validated when descending. */
919
0
            parent = cur->parent;
920
921
0
            if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
922
0
                (cur->type == XML_DOCUMENT_NODE)) {
923
0
                xmlOutputBufferWriteString(buf, "\n");
924
0
            } else {
925
0
                if ((format) && (cur->ns == NULL))
926
0
                    info = htmlTagLookup(cur->name);
927
0
                else
928
0
                    info = NULL;
929
930
0
                if ((format) && (info != NULL) && (!info->isinline) &&
931
0
                    (cur->last->type != HTML_TEXT_NODE) &&
932
0
                    (cur->last->type != HTML_ENTITY_REF_NODE) &&
933
0
                    (cur->children != cur->last) &&
934
0
                    (cur->name != NULL) &&
935
0
                    (cur->name[0] != 'p')) /* p, pre, param */
936
0
                    xmlOutputBufferWriteString(buf, "\n");
937
938
0
                xmlOutputBufferWriteString(buf, "</");
939
0
                if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940
0
                    xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941
0
                    xmlOutputBufferWriteString(buf, ":");
942
0
                }
943
0
                xmlOutputBufferWriteString(buf, (const char *)cur->name);
944
0
                xmlOutputBufferWriteString(buf, ">");
945
946
0
                if ((format) && (info != NULL) && (!info->isinline) &&
947
0
                    (cur->next != NULL)) {
948
0
                    if ((cur->next->type != HTML_TEXT_NODE) &&
949
0
                        (cur->next->type != HTML_ENTITY_REF_NODE) &&
950
0
                        (parent != NULL) &&
951
0
                        (parent->name != NULL) &&
952
0
                        (parent->name[0] != 'p')) /* p, pre, param */
953
0
                        xmlOutputBufferWriteString(buf, "\n");
954
0
                }
955
0
            }
956
0
        }
957
0
    }
958
0
}
959
960
/**
961
 * htmlNodeDumpOutput:
962
 * @buf:  the HTML buffer output
963
 * @doc:  the document
964
 * @cur:  the current node
965
 * @encoding:  the encoding string (unused)
966
 *
967
 * Dump an HTML node, recursive behaviour,children are printed too,
968
 * and formatting returns/spaces are added.
969
 */
970
void
971
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972
0
             xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
973
0
    htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
974
0
}
975
976
/**
977
 * htmlDocContentDumpFormatOutput:
978
 * @buf:  the HTML buffer output
979
 * @cur:  the document
980
 * @encoding:  the encoding string (unused)
981
 * @format:  should formatting spaces been added
982
 *
983
 * Dump an HTML document.
984
 */
985
void
986
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987
                         const char *encoding ATTRIBUTE_UNUSED,
988
0
                               int format) {
989
0
    int type = 0;
990
0
    if (cur) {
991
0
        type = cur->type;
992
0
        cur->type = XML_HTML_DOCUMENT_NODE;
993
0
    }
994
0
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
995
0
    if (cur)
996
0
        cur->type = (xmlElementType) type;
997
0
}
998
999
/**
1000
 * htmlDocContentDumpOutput:
1001
 * @buf:  the HTML buffer output
1002
 * @cur:  the document
1003
 * @encoding:  the encoding string (unused)
1004
 *
1005
 * Dump an HTML document. Formatting return/spaces are added.
1006
 */
1007
void
1008
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1009
0
                   const char *encoding ATTRIBUTE_UNUSED) {
1010
0
    htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1011
0
}
1012
1013
/************************************************************************
1014
 *                  *
1015
 *    Saving functions front-ends       *
1016
 *                  *
1017
 ************************************************************************/
1018
1019
/**
1020
 * htmlDocDump:
1021
 * @f:  the FILE*
1022
 * @cur:  the document
1023
 *
1024
 * Dump an HTML document to an open FILE.
1025
 *
1026
 * returns: the number of byte written or -1 in case of failure.
1027
 */
1028
int
1029
0
htmlDocDump(FILE *f, xmlDocPtr cur) {
1030
0
    xmlOutputBufferPtr buf;
1031
0
    xmlCharEncodingHandlerPtr handler = NULL;
1032
0
    const char *encoding;
1033
0
    int ret;
1034
1035
0
    xmlInitParser();
1036
1037
0
    if ((cur == NULL) || (f == NULL)) {
1038
0
  return(-1);
1039
0
    }
1040
1041
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1042
1043
0
    if (encoding != NULL) {
1044
0
  xmlCharEncoding enc;
1045
1046
0
  enc = xmlParseCharEncoding(encoding);
1047
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1048
0
      handler = xmlFindCharEncodingHandler(encoding);
1049
0
      if (handler == NULL)
1050
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1051
0
  }
1052
0
    } else {
1053
        /*
1054
         * Fallback to HTML or ASCII when the encoding is unspecified
1055
         */
1056
0
        if (handler == NULL)
1057
0
            handler = xmlFindCharEncodingHandler("HTML");
1058
0
        if (handler == NULL)
1059
0
            handler = xmlFindCharEncodingHandler("ascii");
1060
0
    }
1061
1062
0
    buf = xmlOutputBufferCreateFile(f, handler);
1063
0
    if (buf == NULL) return(-1);
1064
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1065
1066
0
    ret = xmlOutputBufferClose(buf);
1067
0
    return(ret);
1068
0
}
1069
1070
/**
1071
 * htmlSaveFile:
1072
 * @filename:  the filename (or URL)
1073
 * @cur:  the document
1074
 *
1075
 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1076
 * used.
1077
 * returns: the number of byte written or -1 in case of failure.
1078
 */
1079
int
1080
0
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1081
0
    xmlOutputBufferPtr buf;
1082
0
    xmlCharEncodingHandlerPtr handler = NULL;
1083
0
    const char *encoding;
1084
0
    int ret;
1085
1086
0
    if ((cur == NULL) || (filename == NULL))
1087
0
        return(-1);
1088
1089
0
    xmlInitParser();
1090
1091
0
    encoding = (const char *) htmlGetMetaEncoding(cur);
1092
1093
0
    if (encoding != NULL) {
1094
0
  xmlCharEncoding enc;
1095
1096
0
  enc = xmlParseCharEncoding(encoding);
1097
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1098
0
      handler = xmlFindCharEncodingHandler(encoding);
1099
0
      if (handler == NULL)
1100
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1101
0
  }
1102
0
    } else {
1103
        /*
1104
         * Fallback to HTML or ASCII when the encoding is unspecified
1105
         */
1106
0
        if (handler == NULL)
1107
0
            handler = xmlFindCharEncodingHandler("HTML");
1108
0
        if (handler == NULL)
1109
0
            handler = xmlFindCharEncodingHandler("ascii");
1110
0
    }
1111
1112
    /*
1113
     * save the content to a temp buffer.
1114
     */
1115
0
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1116
0
    if (buf == NULL) return(0);
1117
1118
0
    htmlDocContentDumpOutput(buf, cur, NULL);
1119
1120
0
    ret = xmlOutputBufferClose(buf);
1121
0
    return(ret);
1122
0
}
1123
1124
/**
1125
 * htmlSaveFileFormat:
1126
 * @filename:  the filename
1127
 * @cur:  the document
1128
 * @format:  should formatting spaces been added
1129
 * @encoding: the document encoding
1130
 *
1131
 * Dump an HTML document to a file using a given encoding.
1132
 *
1133
 * returns: the number of byte written or -1 in case of failure.
1134
 */
1135
int
1136
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1137
0
             const char *encoding, int format) {
1138
0
    xmlOutputBufferPtr buf;
1139
0
    xmlCharEncodingHandlerPtr handler = NULL;
1140
0
    int ret;
1141
1142
0
    if ((cur == NULL) || (filename == NULL))
1143
0
        return(-1);
1144
1145
0
    xmlInitParser();
1146
1147
0
    if (encoding != NULL) {
1148
0
  xmlCharEncoding enc;
1149
1150
0
  enc = xmlParseCharEncoding(encoding);
1151
0
  if (enc != XML_CHAR_ENCODING_UTF8) {
1152
0
      handler = xmlFindCharEncodingHandler(encoding);
1153
0
      if (handler == NULL)
1154
0
    htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1155
0
  }
1156
0
        htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1157
0
    } else {
1158
0
  htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1159
1160
        /*
1161
         * Fallback to HTML or ASCII when the encoding is unspecified
1162
         */
1163
0
        if (handler == NULL)
1164
0
            handler = xmlFindCharEncodingHandler("HTML");
1165
0
        if (handler == NULL)
1166
0
            handler = xmlFindCharEncodingHandler("ascii");
1167
0
    }
1168
1169
    /*
1170
     * save the content to a temp buffer.
1171
     */
1172
0
    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1173
0
    if (buf == NULL) return(0);
1174
1175
0
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1176
1177
0
    ret = xmlOutputBufferClose(buf);
1178
0
    return(ret);
1179
0
}
1180
1181
/**
1182
 * htmlSaveFileEnc:
1183
 * @filename:  the filename
1184
 * @cur:  the document
1185
 * @encoding: the document encoding
1186
 *
1187
 * Dump an HTML document to a file using a given encoding
1188
 * and formatting returns/spaces are added.
1189
 *
1190
 * returns: the number of byte written or -1 in case of failure.
1191
 */
1192
int
1193
0
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1194
0
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1195
0
}
1196
1197
#endif /* LIBXML_OUTPUT_ENABLED */
1198
1199
#endif /* LIBXML_HTML_ENABLED */