Coverage Report

Created: 2024-05-20 06:23

/src/mupdf/source/fitz/xml.c
Line
Count
Source (jump to first uncovered line)
1
// Copyright (C) 2004-2022 Artifex Software, Inc.
2
//
3
// This file is part of MuPDF.
4
//
5
// MuPDF is free software: you can redistribute it and/or modify it under the
6
// terms of the GNU Affero General Public License as published by the Free
7
// Software Foundation, either version 3 of the License, or (at your option)
8
// any later version.
9
//
10
// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12
// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13
// details.
14
//
15
// You should have received a copy of the GNU Affero General Public License
16
// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17
//
18
// Alternative licensing terms are available from the licensor.
19
// For commercial licensing, see <https://www.artifex.com/> or contact
20
// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21
// CA 94129, USA, for further information.
22
23
#include "xml-imp.h"
24
25
#include <string.h>
26
#include <stdlib.h>
27
#include <stdio.h>
28
29
#include <gumbo.h>
30
31
33.9k
#define FZ_XML_MAX_DEPTH 4096
32
33
/* #define FZ_XML_SEQ */
34
35
/*
 * Named character entities and the code point each one stands for.
 * xml_parse_entity() scans this table linearly, comparing the text
 * that follows '&' against each name.
 */
static const struct { const char *name; int c; } html_entities[] = {
  { "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 },
  { "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 },
  { "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 },
  { "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 }, { "deg", 176 },
  { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 },
  { "micro", 181 }, { "para", 182 }, { "middot", 183 }, { "cedil", 184 },
  { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 },
  { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 },
  { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 },
  { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 },
  { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 },
  { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 },
  { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
  { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
  { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
  { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
  { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
  { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
  { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
  { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
  { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
  { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
  { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
  { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 }, { "lt", 60 }, { "gt", 62 },
  { "amp", 38 }, { "apos", 39 }, { "quot", 34 }, { "OElig", 338 }, { "oelig", 339 },
  { "Scaron", 352 }, { "scaron", 353 }, { "Yuml", 376 }, { "circ", 710 },
  { "tilde", 732 }, { "ensp", 8194 }, { "emsp", 8195 }, { "thinsp", 8201 },
  { "zwnj", 8204 }, { "zwj", 8205 }, { "lrm", 8206 }, { "rlm", 8207 },
  { "ndash", 8211 }, { "mdash", 8212 }, { "lsquo", 8216 }, { "rsquo", 8217 },
  { "sbquo", 8218 }, { "ldquo", 8220 }, { "rdquo", 8221 }, { "bdquo", 8222 },
  { "dagger", 8224 }, { "Dagger", 8225 }, { "permil", 8240 }, { "lsaquo", 8249 },
  { "rsaquo", 8250 }, { "euro", 8364 }, { "fnof", 402 }, { "Alpha", 913 },
  { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 }, { "Epsilon", 917 },
  { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 }, { "Iota", 921 }, { "Kappa", 922 },
  { "Lambda", 923 }, { "Mu", 924 }, { "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 },
  { "Pi", 928 }, { "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 },
  { "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 }, { "alpha", 945 },
  { "beta", 946 }, { "gamma", 947 }, { "delta", 948 }, { "epsilon", 949 },
  { "zeta", 950 }, { "eta", 951 }, { "theta", 952 }, { "iota", 953 }, { "kappa", 954 },
  { "lambda", 955 }, { "mu", 956 }, { "nu", 957 }, { "xi", 958 }, { "omicron", 959 },
  { "pi", 960 }, { "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 },
  { "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 }, { "omega", 969 },
  { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 }, { "bull", 8226 },
  { "hellip", 8230 }, { "prime", 8242 }, { "Prime", 8243 }, { "oline", 8254 },
  { "frasl", 8260 }, { "weierp", 8472 }, { "image", 8465 }, { "real", 8476 },
  { "trade", 8482 }, { "alefsym", 8501 }, { "larr", 8592 }, { "uarr", 8593 },
  { "rarr", 8594 }, { "darr", 8595 }, { "harr", 8596 }, { "crarr", 8629 },
  { "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 }, { "dArr", 8659 },
  { "hArr", 8660 }, { "forall", 8704 }, { "part", 8706 }, { "exist", 8707 },
  { "empty", 8709 }, { "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 },
  { "ni", 8715 }, { "prod", 8719 }, { "sum", 8721 }, { "minus", 8722 },
  { "lowast", 8727 }, { "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 },
  { "ang", 8736 }, { "and", 8743 }, { "or", 8744 }, { "cap", 8745 }, { "cup", 8746 },
  { "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 },
  { "asymp", 8776 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 }, { "ge", 8805 },
  { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 }, { "sube", 8838 },
  { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 }, { "perp", 8869 },
  { "sdot", 8901 }, { "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 },
  { "rfloor", 8971 }, { "lang", 9001 }, { "rang", 9002 }, { "loz", 9674 },
  { "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 }, { "diams", 9830 },
};
97
98
struct parser
99
{
100
  fz_pool *pool;
101
  fz_xml *head;
102
  int preserve_white;
103
  int depth;
104
#ifdef FZ_XML_SEQ
105
  int seq;
106
#endif
107
};
108
109
/* Write two spaces to `out` for each of the `n` indentation levels. */
static void xml_indent(fz_context *ctx, fz_output *out, int n)
{
  for (; n != 0; --n)
  {
    fz_write_byte(ctx, out, ' ');
    fz_write_byte(ctx, out, ' ');
  }
}
116
117
/* Dump `item` (and everything below it) to standard output, starting at
 * indentation `level`. */
void fz_debug_xml(fz_xml *item, int level)
{
  /* No context is available here, so NULL is passed throughout. This
   * leans on implementation details: both fz_stdout and
   * fz_write_printf have to cope with a NULL ctx. */
  fz_output *out = fz_stdout(NULL);

  fz_output_xml(NULL, out, item, level);
}
124
125
/*
 * Write a textual dump of `item` and its descendants to `out`.
 *
 * Text nodes are written as a double-quoted string with backslash and
 * control characters escaped and non-ASCII code points written as \uXXXX
 * or \u{X...}. Element nodes are written as "(name", one "=name value"
 * line per attribute, the children at level + 1, and a closing ")name".
 * A NULL item writes nothing. For the document node (no parent) only the
 * children are written.
 */
void fz_output_xml(fz_context *ctx, fz_output *out, fz_xml *item, int level)
{
  char *s;

  if (item == NULL)
    return;

  /* Skip over the DOC object at the top. */
  if (item->up == NULL)
  {
    fz_xml *child;
    for (child = fz_xml_down(item); child; child = child->u.node.next)
      fz_output_xml(ctx, out, child, level + 1);
    return;
  }

  s = fz_xml_text(item);
  xml_indent(ctx, out, level);
  if (s)
  {
    int c;
    fz_write_byte(ctx, out, '"');
    while (*s) {
      s += fz_chartorune(&c, s);
      switch (c) {
      default:
        if (c > 0xFFFF)
          fz_write_printf(ctx, out, "\\u{%X}", c);
        else if (c < 32 || c > 127)
          fz_write_printf(ctx, out, "\\u%04X", c);
        else
          fz_write_byte(ctx, out, c);
        break;
      case '\\': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, '\\'); break;
      case '\b': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'b'); break;
      case '\f': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'f'); break;
      case '\n': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'n'); break;
      case '\r': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 'r'); break;
      case '\t': fz_write_byte(ctx, out, '\\'); fz_write_byte(ctx, out, 't'); break;
      }
    }
    fz_write_byte(ctx, out, '"');
#ifdef FZ_XML_SEQ
    /* The sequence number lives in the node part of the union, as it
     * does for element nodes below. */
    fz_write_printf(ctx, out, " <%d>", item->u.node.seq);
#endif
    fz_write_byte(ctx, out, '\n');
  }
  else
  {
    fz_xml *child;
    struct attribute *att;

#ifdef FZ_XML_SEQ
    fz_write_printf(ctx, out, "(%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
#else
    fz_write_printf(ctx, out, "(%s\n", item->u.node.u.d.name);
#endif
    for (att = item->u.node.u.d.atts; att; att = att->next)
    {
      xml_indent(ctx, out, level);
      fz_write_printf(ctx, out, "=%s %s\n", att->name, att->value);
    }
    for (child = fz_xml_down(item); child; child = child->u.node.next)
      fz_output_xml(ctx, out, child, level + 1);
    xml_indent(ctx, out, level);
#ifdef FZ_XML_SEQ
    fz_write_printf(ctx, out, ")%s <%d>\n", item->u.node.u.d.name, item->u.node.seq);
#else
    fz_write_printf(ctx, out, ")%s\n", item->u.node.u.d.name);
#endif
  }
}
197
198
/* Return the previous sibling of `item`, or NULL when `item` is NULL or
 * has no parent (the document node). */
fz_xml *fz_xml_prev(fz_xml *item)
{
  if (item == NULL || item->up == NULL)
    return NULL;
  return item->u.node.prev;
}
202
203
/* Return the next sibling of `item`, or NULL when `item` is NULL or has
 * no parent (the document node). */
fz_xml *fz_xml_next(fz_xml *item)
{
  if (item == NULL || item->up == NULL)
    return NULL;
  return item->u.node.next;
}
207
208
/* Return the parent of `item`, or NULL when `item` is NULL, has no
 * parent, or its parent is the document node. */
fz_xml *fz_xml_up(fz_xml *item)
{
  fz_xml *parent;

  if (item == NULL)
    return NULL;

  parent = item->up;
  /* Never step up to the DOC: a parent without a parent of its own is
   * the document node. */
  if (parent == NULL || parent->up == NULL)
    return NULL;
  return parent;
}
213
214
/* Return the first child of `item`, or NULL when `item` is NULL or a
 * text node. */
fz_xml *fz_xml_down(fz_xml *item)
{
  /* DOC items can never have MAGIC_TEXT as their down value, so the
   * text test is safe for them too. */
  if (item == NULL || FZ_TEXT_ITEM(item))
    return NULL;
  return item->down;
}
220
221
/* Return the character data of a text node, or NULL when `item` is NULL
 * or is not a text node. */
char *fz_xml_text(fz_xml *item)
{
  /* DOC items can never have MAGIC_TEXT as their down value, so the
   * text test is safe for them too. */
  if (item == NULL || !FZ_TEXT_ITEM(item))
    return NULL;
  return item->u.node.u.text;
}
227
228
/* Return the tag name stored in `item`, or NULL when `item` is NULL or a
 * text node. */
char *fz_xml_tag(fz_xml *item)
{
  /* DOC items can never have MAGIC_TEXT as their down value, so the
   * text test is safe for them too. */
  if (item == NULL || FZ_TEXT_ITEM(item))
    return NULL;
  return item->u.node.u.d.name;
}
234
235
/* Return non-zero when `item` is an element whose tag name equals `name`.
 * NULL, document and text items never match. */
int fz_xml_is_tag(fz_xml *item, const char *name)
{
  if (item == NULL)
    return 0;
  if (FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
    return 0;
  return strcmp(item->u.node.u.d.name, name) == 0;
}
241
242
/* Return the value of the attribute called `name` on element `item`, or
 * NULL when there is no such attribute or `item` is NULL, the document
 * node, or a text node. */
char *fz_xml_att(fz_xml *item, const char *name)
{
  struct attribute *cur;

  if (item == NULL)
    return NULL;
  if (FZ_DOCUMENT_ITEM(item) || FZ_TEXT_ITEM(item))
    return NULL;

  cur = item->u.node.u.d.atts;
  while (cur != NULL)
  {
    if (strcmp(cur->name, name) == 0)
      return cur->value;
    cur = cur->next;
  }
  return NULL;
}
252
253
/* Return the value of attribute `one` on `item`; when that lookup yields
 * NULL, return the result of looking up `two` instead. */
char *fz_xml_att_alt(fz_xml *item, const char *one, const char *two)
{
  char *first = fz_xml_att(item, one);

  return first ? first : fz_xml_att(item, two);
}
260
261
/* Starting at `item` itself, walk the sibling chain and return the first
 * element named `tag`, or NULL when none is found. When `item` is the
 * document node the walk starts at its first child. */
fz_xml *fz_xml_find(fz_xml *item, const char *tag)
{
  fz_xml *cur = item;

  /* Skip over any DOC item. */
  if (cur != NULL && FZ_DOCUMENT_ITEM(cur))
    cur = cur->down;

  for (; cur != NULL; cur = cur->u.node.next)
  {
    if (FZ_TEXT_ITEM(cur))
      continue;
    if (strcmp(cur->u.node.u.d.name, tag) == 0)
      return cur;
  }
  return NULL;
}
275
276
/* Like fz_xml_find, but the search begins at the sibling after `item`
 * rather than at `item` itself. */
fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
{
  /* Skip over any DOC item. */
  if (item != NULL && FZ_DOCUMENT_ITEM(item))
    item = item->down;

  return fz_xml_find(item != NULL ? item->u.node.next : NULL, tag);
}
286
287
/* Search the children of `item` for the first element named `tag`. */
fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
{
  /* fz_xml_down already maps a NULL item to NULL. */
  fz_xml *first_child = fz_xml_down(item);

  return fz_xml_find(first_child, tag);
}
293
294
/* Return non-zero when `item` has an attribute `name` whose value equals
 * `match`; zero when the attribute is absent or differs. */
int fz_xml_att_eq(fz_xml *item, const char *name, const char *match)
{
  const char *val = fz_xml_att(item, name);

  if (val == NULL)
    return 0;
  return strcmp(val, match) == 0;
}
300
301
/* Starting at `item` itself, return the first sibling whose attribute
 * `att` equals `match`. When `tag` is non-NULL only elements with that
 * tag name are considered. Returns NULL when nothing matches. */
fz_xml *fz_xml_find_match(fz_xml *item, const char *tag, const char *att, const char *match)
{
  /* Skip over any document item. */
  if (item != NULL && FZ_DOCUMENT_ITEM(item))
    item = item->down;

  for (;;)
  {
    if (tag != NULL)
      item = fz_xml_find(item, tag);
    if (item == NULL)
      return NULL;
    if (fz_xml_att_eq(item, att, match))
      return item;
    item = item->u.node.next;
  }
}
317
318
/* Like fz_xml_find_match, but the search begins after `item` rather than
 * at `item` itself. */
fz_xml *fz_xml_find_next_match(fz_xml *item, const char *tag, const char *att, const char *match)
{
  /* Skip over any document item. */
  if (item != NULL && FZ_DOCUMENT_ITEM(item))
    item = item->down;

  while (item != NULL)
  {
    /* Always advance first so the starting item is never returned. */
    if (tag != NULL)
      item = fz_xml_find_next(item, tag);
    else
      item = item->u.node.next;

    if (item == NULL || fz_xml_att_eq(item, att, match))
      break;
  }

  return item;
}
335
336
/* Run fz_xml_find_match over the children of `item`. */
fz_xml *fz_xml_find_down_match(fz_xml *item, const char *tag, const char *att, const char *match)
{
  fz_xml *first_child = fz_xml_down(item);

  return fz_xml_find_match(first_child, tag, att, match);
}
340
341
/* Return the root element of the tree that `xml` belongs to, or NULL when
 * `xml` is NULL. */
fz_xml *fz_xml_root(fz_xml *xml)
{
  fz_xml *doc;

  if (xml == NULL)
    return NULL;

  /* A node from the middle of the tree may have been passed in: climb
   * until the parentless doc node is reached. */
  for (doc = xml; doc->up != NULL; doc = doc->up)
    ;

  /* The root is the child of the doc. */
  return doc->down;
}
354
355
/* Drop one reference to the tree containing `xml`. When fz_drop_imp
 * reports a non-zero result the tree's pool is dropped. A NULL `xml` is
 * ignored. */
void fz_drop_xml(fz_context *ctx, fz_xml *xml)
{
  fz_xml *doc = xml;

  if (doc == NULL)
    return;

  /* Wherever we are in the tree, the reference count lives on the doc
   * node at the top. */
  while (doc->up != NULL)
    doc = doc->up;

  /* Drop a reference to the tree as a whole. */
  if (fz_drop_imp(ctx, doc, &doc->u.doc.refs) != 0)
    fz_drop_pool(ctx, doc->u.doc.pool);
}
370
371
/*
 * Make `node` the child of the document node at the top of its tree.
 *
 * The former parent of `node` has its `down` pointer cleared and the
 * document node's `down` pointer is set to `node`. A NULL `node`, or a
 * node that is already the document node, is left untouched.
 */
void fz_detach_xml(fz_context *ctx, fz_xml *node)
{
  fz_xml *doc = node;

  /* Tolerate NULL, like the other public entry points in this file. */
  if (node == NULL)
    return;

  /* If we're already a document node, then this is a NOP. */
  if (doc->up == NULL)
    return;

  /* Move doc to be the doc pointer at the top of the tree. */
  while (doc->up)
  {
    doc = doc->up;
  }

  /* Relocate node to be the child of doc. */
  node->up->down = NULL;
  doc->down = node;

  /* NOTE: Suppose that X = doc->down on entry. On exit doc->down == node, but
   * X->up = doc. We need to be careful throughout this code to not assume that
   * Y is always a child of Y->up. */
}
393
394
size_t xml_parse_entity(int *c, const char *a)
395
1.72k
{
396
1.72k
  char *b;
397
1.72k
  size_t i;
398
399
1.72k
  if (a[1] == '#') {
400
6
    if (a[2] == 'x')
401
1
      *c = strtol(a + 3, &b, 16);
402
5
    else
403
5
      *c = strtol(a + 2, &b, 10);
404
6
    if (*b == ';')
405
1
      return b - a + 1;
406
6
  }
407
1.71k
  else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
408
7
    *c = '<';
409
7
    return 4;
410
7
  }
411
1.71k
  else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
412
3
    *c = '>';
413
3
    return 4;
414
3
  }
415
1.70k
  else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
416
1
    *c = '&';
417
1
    return 5;
418
1
  }
419
1.70k
  else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
420
0
    *c = '\'';
421
0
    return 6;
422
0
  }
423
1.70k
  else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
424
0
    *c = '"';
425
0
    return 6;
426
0
  }
427
428
  /* We should only be doing this for XHTML, but it shouldn't be a problem. */
429
434k
  for (i = 0; i < nelem(html_entities); ++i) {
430
432k
    size_t n = strlen(html_entities[i].name);
431
432k
    if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
432
0
      *c = html_entities[i].c;
433
0
      return n + 2;
434
0
    }
435
432k
  }
436
437
1.71k
  *c = *a;
438
1.71k
  return 1;
439
1.71k
}
440
441
/* Return 1 when `c` may appear in a tag or attribute name as accepted by
 * this parser: ASCII letters, digits, '.', '-', '_' and ':'. */
static inline int isname(int c)
{
  switch (c)
  {
  case '.':
  case '-':
  case '_':
  case ':':
    return 1;
  default:
    break;
  }
  if (c >= '0' && c <= '9')
    return 1;
  if (c >= 'A' && c <= 'Z')
    return 1;
  if (c >= 'a' && c <= 'z')
    return 1;
  return 0;
}
448
449
/* Return 1 when `c` is a space, carriage return, line feed or tab. */
static inline int iswhite(int c)
{
  switch (c)
  {
  case ' ':
  case '\r':
  case '\n':
  case '\t':
    return 1;
  default:
    return 0;
  }
}
453
454
/*
 * Allocate a new node for the source range [a, b), append it as the last
 * child of parser->head and make it the new parser->head.
 *
 * With is_text set the node is marked as text and room is reserved for
 * the characters; the caller fills them in. Otherwise [a, b) is an
 * element name, which is stored with any "prefix:" namespace part
 * removed. Throws FZ_ERROR_SYNTAX once nesting reaches FZ_XML_MAX_DEPTH.
 */
static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, const char *a, const char *b, int is_text)
{
  fz_xml *node;
  fz_xml *parent;
  size_t len;
  size_t header;

  if (is_text)
  {
    header = offsetof(fz_xml, u.node.u.text);
  }
  else
  {
    const char *scan;

    /* Drop the namespace prefix: start after the last ':' that is not
     * the final character of the name. */
    for (scan = a; scan < b - 1; ++scan)
      if (*scan == ':')
        a = scan + 1;

    header = offsetof(fz_xml, u.node.u.d.name);
  }

  len = b - a;
  node = fz_pool_alloc(ctx, parser->pool, header + len + 1);

  if (is_text)
  {
    node->down = MAGIC_TEXT;
  }
  else
  {
    memcpy(node->u.node.u.d.name, a, len);
    node->u.node.u.d.name[len] = 0;
    node->u.node.u.d.atts = NULL;
    node->down = NULL;
  }

  parent = parser->head;
  node->up = parent;
  node->u.node.next = NULL;
#ifdef FZ_XML_SEQ
  node->u.node.seq = parser->seq++;
#endif

  /* While an element is still open its own `next` field is borrowed to
   * remember the last child added so far. It is reset to NULL when the
   * element is closed. */
  if (parent->down == NULL)
  {
    parent->down = node;
    parent->u.node.next = node;
    node->u.node.prev = NULL;
  }
  else
  {
    fz_xml *last = parent->u.node.next;

    last->u.node.next = node;
    node->u.node.prev = last;
    parent->u.node.next = node;
  }

  parser->head = node;
  if (++parser->depth >= FZ_XML_MAX_DEPTH)
    fz_throw(ctx, FZ_ERROR_SYNTAX, "too deep xml element nesting");
}
509
510
/* Create an attribute named by [a, b) with no value yet and push it onto
 * the front of the attribute list of parser->head. */
static void xml_emit_att_name(fz_context *ctx, struct parser *parser, const char *a, const char *b)
{
  fz_xml *owner = parser->head;
  size_t len = b - a;
  struct attribute *att;

  att = fz_pool_alloc(ctx, parser->pool, offsetof(struct attribute, name) + len + 1);

  memcpy(att->name, a, len);
  att->name[len] = 0;
  att->value = NULL;

  /* Newest attribute goes first; xml_emit_att_value relies on that. */
  att->next = owner->u.node.u.d.atts;
  owner->u.node.u.d.atts = att;
}
524
525
/* Allocate copies of `key` and `val` from `pool` and push the resulting
 * attribute onto the front of the attribute list of `node`. */
void fz_xml_add_att(fz_context *ctx, fz_pool *pool, fz_xml *node, const char *key, const char *val)
{
  /* Sizes include the terminating NUL. */
  size_t key_size = strlen(key) + 1;
  size_t val_size = strlen(val) + 1;
  struct attribute *att;

  att = fz_pool_alloc(ctx, pool, offsetof(struct attribute, name) + key_size);
  memcpy(att->name, key, key_size);

  att->value = fz_pool_alloc(ctx, pool, val_size);
  memcpy(att->value, val, val_size);

  att->next = node->u.node.u.d.atts;
  node->u.node.u.d.atts = att;
}
535
536
/* Store [a, b), with entity references decoded, as the value of the
 * attribute at the front of the attribute list of parser->head. */
static void xml_emit_att_value(fz_context *ctx, struct parser *parser, const char *a, const char *b)
{
  struct attribute *att = parser->head->u.node.u.d.atts;
  char *out;
  int rune;

  /* The buffer is sized from the raw input: this relies on every entity
   * being at least as long as the UTF-8 encoding it decodes to. */
  out = fz_pool_alloc(ctx, parser->pool, b - a + 1);
  att->value = out;

  while (a < b)
  {
    if (*a != '&')
    {
      *out++ = *a++;
      continue;
    }
    a += xml_parse_entity(&rune, a);
    out += fz_runetochar(out, rune);
  }
  *out = 0;
}
556
557
/* Close the node at parser->head: decrement the depth, clear the `next`
 * field that was borrowed as a last-child pointer, and step back up to
 * the parent when there is one. */
static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
{
  fz_xml *closing = parser->head;

  parser->depth--;
  closing->u.node.next = NULL;
  if (closing->up != NULL)
    parser->head = closing->up;
}
564
565
/* Add [a, b) to the tree as a text node with entity references decoded.
 * Nothing is added at depth zero, nor, unless preserve_white is set, for
 * text that is entirely whitespace. */
static void xml_emit_text(fz_context *ctx, struct parser *parser, const char *a, const char *b)
{
  char *out;
  int rune;

  /* Skip text outside the root tag */
  if (parser->depth == 0)
    return;

  /* Skip all-whitespace text nodes */
  if (!parser->preserve_white)
  {
    const char *scan = a;

    while (scan < b && iswhite(*scan))
      ++scan;
    if (scan == b)
      return;
  }

  xml_emit_open_tag(ctx, parser, a, b, 1);

  /* The node was sized from the raw input: this relies on every entity
   * being at least as long as the UTF-8 encoding it decodes to. */
  out = fz_xml_text(parser->head);
  while (a < b)
  {
    if (*a != '&')
    {
      *out++ = *a++;
      continue;
    }
    a += xml_parse_entity(&rune, a);
    out += fz_runetochar(out, rune);
  }
  *out = 0;

  xml_emit_close_tag(ctx, parser);
}
604
605
/* Add [a, b) to the tree as a text node, copied verbatim: no entity
 * decoding and no whitespace filtering. */
static void xml_emit_cdata(fz_context *ctx, struct parser *parser, const char *a, const char *b)
{
  size_t len = b - a;
  char *text;

  xml_emit_open_tag(ctx, parser, a, b, 1);

  text = parser->head->u.node.u.text;
  memcpy(text, a, len);
  text[len] = 0;

  xml_emit_close_tag(ctx, parser);
}
620
621
/* Handle a closing tag whose name spans [mark, p). When the name, minus
 * any namespace prefix, equals the tag of parser->head the element is
 * closed and 0 is returned; otherwise 1 is returned and nothing changes. */
static int close_tag(fz_context *ctx, struct parser *parser, const char *mark, const char *p)
{
  const char *scan;
  const char *open_name;
  size_t len;

  /* skip namespace prefix */
  for (scan = mark; scan < p - 1; ++scan)
    if (*scan == ':')
      mark = scan + 1;

  open_name = fz_xml_tag(parser->head);
  if (open_name == NULL)
    return 1;

  /* Both the characters and the length have to agree. */
  len = p - mark;
  if (strncmp(open_name, mark, len) != 0 || open_name[len] != 0)
    return 1;

  xml_emit_close_tag(ctx, parser);
  return 0;
}
638
639
/*
 * Parse the NUL-terminated document at `p`, building the tree through the
 * xml_emit_* helpers.
 *
 * Written as a goto-driven state machine: each label handles one
 * syntactic context and jumps to the next. Returns NULL when the end of
 * the input is reached from the text state, or a static string
 * describing the first syntax error found.
 */
static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, const char *p) /* lgtm [cpp/use-of-goto] */
{
  const char *mark;
  int quote;

parse_text:
  /* Character data up to the next '<' or the end of input. */
  mark = p;
  while (*p && *p != '<') ++p;
  if (*p == '<') {
    if (mark < p)
      xml_emit_text(ctx, parser, mark, p);
    ++p;
    goto parse_element;
  } else if (mark < p)
    xml_emit_text(ctx, parser, mark, p);
  return NULL;

parse_element:
  /* Just after '<': decide what kind of markup this is. */
  if (*p == '/') { ++p; goto parse_closing_element; }
  if (*p == '!') { ++p; goto parse_comment; }
  if (*p == '?') { ++p; goto parse_processing_instruction; }
  while (iswhite(*p)) ++p;
  if (isname(*p))
    goto parse_element_name;
  return "syntax error in element";

parse_comment:
  /* Just after "<!": DOCTYPE/ENTITY declaration, CDATA or a comment. */
  if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
    goto parse_declaration;
  if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
    goto parse_declaration;
  if (*p == '[') goto parse_cdata;
  if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
  if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
  while (*p) {
    if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
      p += 3;
      goto parse_text;
    }
    ++p;
  }
  return "end of data in comment";

parse_declaration:
  /* Declarations are skipped up to and including the next '>'. */
  while (*p) if (*p++ == '>') goto parse_text;
  return "end of data in declaration";

parse_cdata:
  /* p points at the '[' of "[CDATA[". */
  if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
    return "syntax error in CDATA section";
  p += 7;
  mark = p;
  while (*p) {
    if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
      xml_emit_cdata(ctx, parser, mark, p);
      p += 3;
      goto parse_text;
    }
    ++p;
  }
  return "end of data in CDATA section";

parse_processing_instruction:
  /* Processing instructions are skipped up to and including "?>". */
  while (*p) {
    if (p[0] == '?' && p[1] == '>') {
      p += 2;
      goto parse_text;
    }
    ++p;
  }
  return "end of data in processing instruction";

parse_closing_element:
  while (iswhite(*p)) ++p;
  mark = p;
  while (isname(*p)) ++p;
  if (!isname(*mark))
    return "syntax error in closing element";
  if (close_tag(ctx, parser, mark, p))
    return "opening and closing tag mismatch";
  while (iswhite(*p)) ++p;
  if (*p != '>')
    return "syntax error in closing element";
  ++p;
  goto parse_text;

parse_element_name:
  mark = p;
  while (isname(*p)) ++p;
  xml_emit_open_tag(ctx, parser, mark, p, 0);
  if (*p == '>') {
    ++p;
    goto parse_text;
  }
  if (p[0] == '/' && p[1] == '>') {
    xml_emit_close_tag(ctx, parser);
    p += 2;
    goto parse_text;
  }
  if (iswhite(*p))
    goto parse_attributes;
  return "syntax error after element name";

parse_attributes:
  while (iswhite(*p)) ++p;
  if (isname(*p))
    goto parse_attribute_name;
  if (*p == '>') {
    ++p;
    goto parse_text;
  }
  if (p[0] == '/' && p[1] == '>') {
    xml_emit_close_tag(ctx, parser);
    p += 2;
    goto parse_text;
  }
  return "syntax error in attributes";

parse_attribute_name:
  mark = p;
  while (isname(*p)) ++p;
  xml_emit_att_name(ctx, parser, mark, p);
  while (iswhite(*p)) ++p;
  if (*p == '=') { ++p; goto parse_attribute_value; }
  return "syntax error after attribute name";

parse_attribute_value:
  while (iswhite(*p)) ++p;
  quote = *p++;
  mark = p;

  /* special case for handling MOBI filepos=00000 syntax */
  if (quote >= '0' && quote <= '9') {
    while (*p >= '0' && *p <= '9') ++p;
    /* The first digit was consumed into `quote` above, so the value
     * starts one byte before mark; using mark would drop that digit. */
    xml_emit_att_value(ctx, parser, mark - 1, p);
    goto parse_attributes;
  }

  if (quote != '"' && quote != '\'')
    return "missing quote character";
  while (*p && *p != quote) ++p;
  if (*p == quote) {
    xml_emit_att_value(ctx, parser, mark, p++);
    goto parse_attributes;
  }
  return "end of data in attribute value";
}
786
787
/* Map ASCII 'A'..'Z' to lower case; every other value is returned
 * unchanged. */
static int fast_tolower(int c)
{
  if (c >= 'A' && c <= 'Z')
    return c | 32;
  return c;
}
793
794
/* Compare at most `n` characters of `a` and `b`, ignoring ASCII case.
 * Returns zero when they agree, otherwise the difference between the
 * lower-cased characters at the first position that differs. */
static int fast_strncasecmp(const char *a, const char *b, size_t n)
{
  size_t i;

  if (n == 0)
    return 0;

  /* Stop on the n-th character, at either terminator, or at the first
   * mismatch; the final comparison below then decides the result. */
  for (i = 0; i + 1 < n; ++i)
  {
    if (a[i] == 0 || b[i] == 0)
      break;
    if (fast_tolower(a[i]) != fast_tolower(b[i]))
      break;
  }
  return fast_tolower(a[i]) - fast_tolower(b[i]);
}
802
803
/* Return a pointer to the first occurrence of `n` in `h`, ignoring ASCII
 * case, or NULL when there is none. */
static char *fast_strcasestr(char *h, char *n)
{
  /* Split the needle into its first character and the remainder so the
   * cheap single-character test filters most positions. */
  int first = fast_tolower(*n);
  char *rest = n + 1;
  size_t rest_len = strlen(rest);
  char *cur;

  for (cur = h; *cur != 0; ++cur)
  {
    if (fast_tolower(*cur) != first)
      continue;
    if (fast_strncasecmp(cur + 1, rest, rest_len) == 0)
      return cur;
  }
  return NULL;
}
815
816
/* Return non-zero when `a` begins with `b`, ignoring ASCII case. */
static int startswith(const char *a, const char *b)
{
  size_t prefix_len = strlen(b);

  return fast_strncasecmp(a, b, prefix_len) == 0;
}
820
821
/* https://encoding.spec.whatwg.org/#names-and-labels */
822
static struct { char *encoding; char *alias; } encoding_aliases[] = {
823
  { "big5", "big5" },
824
  { "big5", "big5-hkscs" },
825
  { "big5", "cn-big5" },
826
  { "big5", "csbig5" },
827
  { "big5", "x-x-big5" },
828
  { "euc-cn", "euc-cn" },
829
  { "euc-jp", "cseucpkdfmtjapanese" },
830
  { "euc-jp", "euc-jp" },
831
  { "euc-jp", "x-euc-jp" },
832
  { "euc-kr", "cseuckr" },
833
  { "euc-kr", "csksc56011987" },
834
  { "euc-kr", "euc-kr" },
835
  { "euc-kr", "iso-ir-149" },
836
  { "euc-kr", "korean" },
837
  { "euc-kr", "ks_c_5601" },
838
  { "euc-kr", "ksc5601" },
839
  { "euc-kr", "ksc_5601" },
840
  { "euc-kr", "windows-949" },
841
  { "euc-tw", "euc-tw" },
842
  { "gb18030", "chinese" },
843
  { "gb18030", "csgb2312" },
844
  { "gb18030", "csiso58gb231280" },
845
  { "gb18030", "gb18030" },
846
  { "gb18030", "gb2312" },
847
  { "gb18030", "gb_2312" },
848
  { "gb18030", "gbk" },
849
  { "gb18030", "iso-ir-58" },
850
  { "gb18030", "x-gbk" },
851
  { "iso-8859-1", "ascii" },
852
  { "iso-8859-1", "iso-8859-1" },
853
  { "iso-8859-1", "iso8859-1" },
854
  { "iso-8859-1", "latin1" },
855
  { "iso-8859-1", "us-ascii" },
856
  { "iso-8859-7", "greek" },
857
  { "iso-8859-7", "greek8" },
858
  { "iso-8859-7", "iso-8859-1" },
859
  { "iso-8859-7", "iso8859-1" },
860
  { "koi8-r", "koi" },
861
  { "koi8-r", "koi8" },
862
  { "koi8-r", "koi8-r" },
863
  { "koi8-r", "koi8-ru" },
864
  { "koi8-r", "koi8-u" },
865
  { "koi8-r", "koi8_r" },
866
  { "shift_jis", "csshiftjis" },
867
  { "shift_jis", "ms932" },
868
  { "shift_jis", "ms_kanji" },
869
  { "shift_jis", "shift-jis" },
870
  { "shift_jis", "shift_jis" },
871
  { "shift_jis", "sjis" },
872
  { "shift_jis", "windows-31j" },
873
  { "shift_jis", "x-sjis" },
874
  { "windows-1250", "cp1250" },
875
  { "windows-1250", "windows-1250" },
876
  { "windows-1251", "cp1251" },
877
  { "windows-1251", "windows-1251" },
878
  { "windows-1252", "cp1252" },
879
  { "windows-1252", "cp819" },
880
  { "windows-1252", "windows-1252" },
881
};
882
883
static char *match_encoding_name(char *enc)
884
213
{
885
213
  size_t i;
886
12.5k
  for (i = 0; i < nelem(encoding_aliases); ++i)
887
12.3k
    if (startswith(enc, encoding_aliases[i].alias))
888
0
      return encoding_aliases[i].encoding;
889
213
  return NULL;
890
213
}
891
892
// Look for encoding in <meta http-equiv="content-type" content="text/html; charset=XXX"> tags
893
/* Scan `s` for <meta> tags that mention both "http-equiv" and
 * "content-type" and carry a "charset=" value. Returns the encoding name
 * matched for the first such value that is recognised, or NULL. The
 * buffer is modified temporarily while scanning and restored afterwards. */
static const char *find_meta_encoding(char *s)
{
  const char *table = NULL;
  char *meta;

  for (meta = fast_strcasestr(s, "<meta");
      meta && !table;
      meta = fast_strcasestr(meta + 5, "<meta"))
  {
    char *charset;
    char *end = strchr(meta, '>');

    if (!end)
      continue;

    /* Terminate the string at the end of this tag so the searches below
     * cannot run into the rest of the document. */
    *end = 0;
    if (fast_strcasestr(meta, "http-equiv") && fast_strcasestr(meta, "content-type"))
    {
      charset = fast_strcasestr(meta, "charset=");
      if (charset)
        table = match_encoding_name(charset + 8);
    }
    *end = '>';
  }

  return table;
}
922
923
/*
 * Work out which encoding the document in `s` declares.
 *
 * The text up to the first '>' is searched for an "<?xml" declaration
 * with an "encoding=" pseudo-attribute. When that does not produce a
 * recognised encoding, find_meta_encoding() is tried. Returns the
 * encoding name or NULL. The buffer is modified temporarily while
 * scanning and restored afterwards.
 */
static const char *find_xml_encoding(char *s)
{
  const char *table = NULL;
  char *end, *xml, *enc;

  end = strchr(s, '>');
  if (end)
  {
    /* Terminate at the first '>' so the searches stay inside it. */
    *end = 0;
    xml = strstr(s, "<?xml");
    if (xml)
    {
      enc = strstr(xml, "encoding=");
      if (enc)
      {
        /* Step over "encoding=" (9 characters), then over the opening
         * quote only if one is really there. Skipping a fixed 10
         * characters could move past the temporary terminator and cut
         * the first letter off an unquoted value. */
        enc += 9;
        if (*enc == '"' || *enc == '\'')
          ++enc;
        enc = match_encoding_name(enc);
        if (enc)
          table = enc;
      }
    }
    *end = '>';
  }

  if (!table)
    table = find_meta_encoding(s);

  return table;
}
951
952
/* Read one 16-bit code unit from s using the requested byte order. */
static int xml_utf16_unit(const unsigned char *s, int big_endian)
{
  if (big_endian)
    return s[0] << 8 | s[1];
  return s[0] | s[1] << 8;
}

/*
  Transcode the UTF-16 code units in [s, e) to a NUL-terminated UTF-8 string
  written to dst. A trailing odd byte is ignored.

  A high surrogate immediately followed by a low surrogate is combined into a
  single code point before encoding; an unpaired surrogate is handed to
  fz_runetochar unchanged.
*/
static void xml_utf16_to_utf8(char *dst, const unsigned char *s, const unsigned char *e, int big_endian)
{
  char *d = dst;
  int c, lo;

  while (s + 1 < e) {
    c = xml_utf16_unit(s, big_endian);
    s += 2;
    if (c >= 0xD800 && c <= 0xDBFF && s + 1 < e) {
      lo = xml_utf16_unit(s, big_endian);
      if (lo >= 0xDC00 && lo <= 0xDFFF) {
        c = 0x10000 + ((c - 0xD800) << 10) + (lo - 0xDC00);
        s += 2;
      }
    }
    d += fz_runetochar(d, c);
  }
  *d = 0;
}

/*
  Return the document text in s (n bytes, NUL-terminated by the caller) as
  UTF-8.

  - A UTF-16 byte order mark (FE FF or FF FE) selects transcoding from
    big- or little-endian UTF-16 into a freshly allocated buffer.
  - Otherwise, if find_xml_encoding names a known encoding, the text is run
    through the matching fz_text_decoder into a freshly allocated buffer.
  - Otherwise the input is returned as is, skipping a leading UTF-8 byte
    order mark (EF BB BF) if present.

  *dofree is set to 1 when the result was allocated here (the callers in this
  file release it with fz_free), and to 0 when the result points into s.
*/
static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
{
  fz_text_decoder dec;
  const char *enc;
  const unsigned char *e = s + n;
  char *dst;
  int m;

  if (s[0] == 0xFE && s[1] == 0xFF) {
    /* Combining surrogate pairs only shrinks the output, so the n * FZ_UTFMAX bound used before still holds. */
    dst = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_be");
    xml_utf16_to_utf8(dst, s + 2, e, 1);
    *dofree = 1;
    return dst;
  }

  if (s[0] == 0xFF && s[1] == 0xFE) {
    dst = Memento_label(fz_malloc(ctx, n * FZ_UTFMAX), "utf8_from_le");
    xml_utf16_to_utf8(dst, s + 2, e, 0);
    *dofree = 1;
    return dst;
  }

  enc = find_xml_encoding((char*)s);
  if (enc)
  {
    fz_init_text_decoder(ctx, &dec, enc);
    // NOTE: use decode_size if memory is more important than speed
    m = dec.decode_bound(&dec, s, n);
    dst = Memento_label(fz_malloc(ctx, m), "utf8");
    dec.decode(&dec, dst, s, n);
    *dofree = 1;
    return dst;
  }

  *dofree = 0;

  if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
    return (char*)s+3;

  return (char*)s;
}
1006
1007
fz_xml *
1008
fz_parse_xml_stream(fz_context *ctx, fz_stream *stm, int preserve_white)
1009
0
{
1010
0
  fz_buffer *buf = fz_read_all(ctx, stm, 128);
1011
0
  fz_xml *xml = NULL;
1012
1013
0
  fz_var(xml);
1014
1015
0
  fz_try(ctx)
1016
0
    xml = fz_parse_xml(ctx, buf, preserve_white);
1017
0
  fz_always(ctx)
1018
0
    fz_drop_buffer(ctx, buf);
1019
0
  fz_catch(ctx)
1020
0
    fz_rethrow(ctx);
1021
1022
0
  return xml;
1023
0
}
1024
1025
static fz_xml *
1026
parse_and_drop_buffer(fz_context *ctx, fz_buffer *buf, int preserve_white)
1027
209
{
1028
209
  fz_xml *xml = NULL;
1029
1030
209
  fz_var(xml);
1031
1032
418
  fz_try(ctx)
1033
418
    xml = fz_parse_xml(ctx, buf, preserve_white);
1034
418
  fz_always(ctx)
1035
209
    fz_drop_buffer(ctx, buf);
1036
209
  fz_catch(ctx)
1037
12
    fz_rethrow(ctx);
1038
1039
197
  return xml;
1040
209
}
1041
1042
fz_xml *
1043
fz_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
1044
136
{
1045
136
  fz_buffer *buf = fz_read_archive_entry(ctx, arch, filename);
1046
1047
136
  return parse_and_drop_buffer(ctx, buf, preserve_white);
1048
136
}
1049
1050
fz_xml *
1051
fz_try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
1052
283
{
1053
283
  fz_buffer *buf = fz_try_read_archive_entry(ctx, arch, filename);
1054
1055
283
  if (buf == NULL)
1056
179
    return NULL;
1057
1058
104
  return parse_and_drop_buffer(ctx, buf, preserve_white);
1059
283
}
1060
1061
fz_xml *
1062
fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white)
1063
634
{
1064
634
  struct parser parser;
1065
634
  fz_xml *xml = NULL;
1066
634
  fz_xml root, *node;
1067
634
  char *p = NULL;
1068
634
  char *error;
1069
634
  int dofree = 0;
1070
634
  unsigned char *s;
1071
634
  size_t n;
1072
634
  static unsigned char empty_string[] = "";
1073
1074
634
  fz_var(dofree);
1075
634
  fz_var(p);
1076
1077
634
  if (buf == NULL)
1078
0
  {
1079
0
    n = 0;
1080
0
    s = empty_string;
1081
0
  }
1082
634
  else
1083
634
  {
1084
    /* ensure we are zero-terminated */
1085
634
    fz_terminate_buffer(ctx, buf);
1086
634
    n = fz_buffer_storage(ctx, buf, &s);
1087
634
  }
1088
1089
634
  memset(&root, 0, sizeof(root));
1090
634
  parser.pool = fz_new_pool(ctx);
1091
634
  parser.head = &root;
1092
634
  parser.preserve_white = preserve_white;
1093
634
  parser.depth = 0;
1094
#ifdef FZ_XML_SEQ
1095
  parser.seq = 0;
1096
#endif
1097
1098
1.26k
  fz_try(ctx)
1099
1.26k
  {
1100
634
    p = convert_to_utf8(ctx, s, n, &dofree);
1101
1102
634
    error = xml_parse_document_imp(ctx, &parser, p);
1103
634
    if (error)
1104
77
      fz_throw(ctx, FZ_ERROR_SYNTAX, "%s", error);
1105
1106
2.62k
    for (node = parser.head; node; node = node->up)
1107
2.06k
      node->u.node.next = NULL;
1108
1109
557
    xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
1110
557
    xml->up = NULL;
1111
557
    xml->down = root.down;
1112
557
    xml->u.doc.refs = 1;
1113
557
    xml->u.doc.pool = parser.pool;
1114
1115
1.11k
    for (node = root.down; node; node = node->u.node.next)
1116
555
      node->up = xml;
1117
557
  }
1118
1.26k
  fz_always(ctx)
1119
634
  {
1120
634
    if (dofree)
1121
0
      fz_free(ctx, p);
1122
634
  }
1123
634
  fz_catch(ctx)
1124
77
  {
1125
77
    fz_drop_pool(ctx, parser.pool);
1126
77
    fz_rethrow(ctx);
1127
77
  }
1128
1129
480
  return xml;
1130
557
}
1131
1132
/*
1133
  Parse the contents of buffer into a tree of XML nodes, using the HTML5 syntax.
1134
1135
  Gumbo doesn't check for malloc errors. Use our pool allocator and let it longjmp
1136
  out of Gumbo on allocation errors. At the end (success or fail) we release the
1137
  pool used for Gumbo's parse tree all at once.
1138
*/
1139
1140
struct mem_gumbo {
1141
  fz_context *ctx;
1142
  fz_pool *pool;
1143
};
1144
1145
static void *alloc_gumbo(void *ctx, size_t size)
1146
9.54k
{
1147
9.54k
  struct mem_gumbo *mem = ctx;
1148
9.54k
  return fz_pool_alloc(mem->ctx, mem->pool, size);
1149
9.54k
}
1150
1151
/*
  Gumbo deallocator callback. Intentionally does nothing: individual
  allocations are never released one by one.
*/
static void dealloc_gumbo(void *ctx, void *ptr)
{
  (void)ctx;
  (void)ptr;
}
1155
1156
/*
  Recursively replay a Gumbo parse tree into the XML parser's emit callbacks.

  Elements produce an open tag, their attributes, their children (in order)
  and a close tag. Text, CDATA and whitespace nodes are emitted as text.
  Document, comment and template nodes produce nothing.
*/
static void xml_from_gumbo(fz_context *ctx, struct parser *parser, GumboNode *node)
{
  const char *name, *name_end, *limit;
  unsigned int k;

  switch (node->type)
  {
  case GUMBO_NODE_DOCUMENT:
  case GUMBO_NODE_COMMENT:
  case GUMBO_NODE_TEMPLATE:
    break;

  case GUMBO_NODE_TEXT:
  case GUMBO_NODE_CDATA:
  case GUMBO_NODE_WHITESPACE:
    {
      const char *text = node->v.text.text;
      xml_emit_text(ctx, parser, text, text + strlen(text));
    }
    break;

  case GUMBO_NODE_ELEMENT:
    if (node->v.element.tag == GUMBO_TAG_UNKNOWN)
    {
      /*
        No normalized name is available, so cut the name out of the
        original tag text: skip a leading '<' and stop at the first '>',
        '/' or whitespace, never reading past the recorded tag length.
      */
      name = node->v.element.original_tag.data;
      limit = name + node->v.element.original_tag.length;
      if (name[0] == '<')
        ++name;
      name_end = name;
      while (name_end < limit && name_end[0] != '>' && name_end[0] != '/' && !iswhite(name_end[0]))
        ++name_end;
    }
    else
    {
      name = gumbo_normalized_tagname(node->v.element.tag);
      name_end = name + strlen(name);
    }

    xml_emit_open_tag(ctx, parser, name, name_end, 0);

    for (k = 0; k < node->v.element.attributes.length; ++k)
    {
      GumboAttribute *attr = node->v.element.attributes.data[k];
      xml_emit_att_name(ctx, parser, attr->name, attr->name + strlen(attr->name));
      xml_emit_att_value(ctx, parser, attr->value, attr->value + strlen(attr->value));
    }

    for (k = 0; k < node->v.element.children.length; ++k)
      xml_from_gumbo(ctx, parser, node->v.element.children.data[k]);

    xml_emit_close_tag(ctx, parser);
    break;
  }
}
1206
1207
fz_xml *
1208
fz_parse_xml_from_html5(fz_context *ctx, fz_buffer *buf)
1209
53
{
1210
53
  struct parser parser;
1211
53
  fz_xml *xml = NULL;
1212
53
  fz_xml root, *node;
1213
53
  char *p = NULL;
1214
53
  int dofree = 0;
1215
53
  unsigned char *s;
1216
53
  size_t n;
1217
53
  GumboOutput *soup = NULL;
1218
53
  GumboOptions opts;
1219
53
  struct mem_gumbo mem;
1220
53
  static unsigned char empty_string[] = "";
1221
1222
53
  fz_var(mem.pool);
1223
53
  fz_var(soup);
1224
53
  fz_var(dofree);
1225
53
  fz_var(p);
1226
1227
53
  if (buf == NULL)
1228
0
  {
1229
0
    n = 0;
1230
0
    s = empty_string;
1231
0
  }
1232
53
  else
1233
53
  {
1234
    /* ensure we are zero-terminated */
1235
53
    fz_terminate_buffer(ctx, buf);
1236
53
    n = fz_buffer_storage(ctx, buf, &s);
1237
53
  }
1238
1239
53
  mem.ctx = ctx;
1240
53
  mem.pool = NULL;
1241
1242
53
  memset(&root, 0, sizeof(root));
1243
53
  parser.pool = fz_new_pool(ctx);
1244
53
  parser.head = &root;
1245
53
  parser.preserve_white = 1;
1246
53
  parser.depth = 0;
1247
#ifdef FZ_XML_SEQ
1248
  parser.seq = 0;
1249
#endif
1250
1251
106
  fz_try(ctx)
1252
106
  {
1253
53
    p = convert_to_utf8(ctx, s, n, &dofree);
1254
1255
53
    mem.pool = fz_new_pool(ctx);
1256
53
    memset(&opts, 0, sizeof opts);
1257
53
    opts.allocator = alloc_gumbo;
1258
53
    opts.deallocator = dealloc_gumbo;
1259
53
    opts.userdata = &mem;
1260
53
    opts.tab_stop = 8;
1261
53
    opts.stop_on_first_error = 0;
1262
53
    opts.max_errors = -1;
1263
53
    opts.fragment_context = GUMBO_TAG_LAST;
1264
53
    opts.fragment_namespace = GUMBO_NAMESPACE_HTML;
1265
1266
53
    soup = gumbo_parse_with_options(&opts, (const char *)p, strlen(p));
1267
1268
53
    xml_from_gumbo(ctx, &parser, soup->root);
1269
1270
106
    for (node = parser.head; node; node = node->up)
1271
53
      node->u.node.next = NULL;
1272
1273
53
    xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
1274
53
    xml->up = NULL;
1275
53
    xml->down = root.down;
1276
53
    xml->u.doc.pool = parser.pool;
1277
53
    xml->u.doc.refs = 1;
1278
1279
106
    for (node = root.down; node; node = node->u.node.next)
1280
53
      node->up = xml;
1281
53
  }
1282
106
  fz_always(ctx)
1283
53
  {
1284
53
    if (soup)
1285
53
      gumbo_destroy_output(&opts, soup);
1286
53
    fz_drop_pool(ctx, mem.pool);
1287
53
    if (dofree)
1288
0
      fz_free(ctx, p);
1289
53
  }
1290
53
  fz_catch(ctx)
1291
0
  {
1292
0
    fz_drop_pool(ctx, parser.pool);
1293
0
    fz_rethrow(ctx);
1294
0
  }
1295
1296
53
  return xml;
1297
53
}
1298
1299
/*
  Depth-first search starting at item, with no 'top' boundary.
  Equivalent to fz_xml_find_dfs_top with top == NULL.
*/
fz_xml *fz_xml_find_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
{
  fz_xml *no_top = NULL;

  return fz_xml_find_dfs_top(item, tag, att, match, no_top);
}
1303
1304
/*
  Depth-first search for a matching element, starting with item itself.

  A non-text item matches when:
   - tag is NULL or equals the item's name, and
   - att is NULL, or (match == NULL) the item has attribute att, or
     (match != NULL) fz_xml_att_eq(item, att, match) holds.

  If item is a document object the search starts at its first child. The walk
  visits children before siblings; when climbing back up it gives up (returns
  NULL) on reaching top, a NULL parent, or the document object.
*/
fz_xml *fz_xml_find_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
{
  /* Skip over any DOC object. */
  if (item && FZ_DOCUMENT_ITEM(item))
    item = item->down;

  while (item)
  {
    int is_text = FZ_TEXT_ITEM(item);

    if (!is_text && (tag == NULL || strcmp(item->u.node.u.d.name, tag) == 0))
    {
      int hit;

      if (att == NULL)
        hit = 1;
      else if (match == NULL)
        hit = (fz_xml_att(item, att) != NULL);
      else
        hit = fz_xml_att_eq(item, att, match);

      if (hit)
        return item;
    }

    /* Children first... */
    if (!is_text && item->down)
    {
      item = item->down;
      continue;
    }

    /* ...then the next sibling... */
    if (item->u.node.next)
    {
      item = item->u.node.next;
      continue;
    }

    /* ...otherwise climb until some ancestor has a next sibling. */
    for (;;)
    {
      item = item->up;
      /*
        Give up when we hit the declared 'top' item, when we run out of
        parents (should never happen, but just in case), or when we reach
        the DOC object at the top.
      */
      if (item == top || item == NULL || item->up == NULL)
        return NULL;
      if (item->u.node.next)
      {
        item = item->u.node.next;
        break;
      }
    }
  }

  return NULL;
}
1344
1345
/*
  Continue a depth-first search after item, with no 'top' boundary.
  Equivalent to fz_xml_find_next_dfs_top with top == NULL.
*/
fz_xml *fz_xml_find_next_dfs(fz_xml *item, const char *tag, const char *att, const char *match)
{
  fz_xml *no_top = NULL;

  return fz_xml_find_next_dfs_top(item, tag, att, match, no_top);
}
1349
1350
/*
  Continue a depth-first search from the position after item.

  Steps to the node that follows item in depth-first order (first child,
  else next sibling, else the next sibling of the nearest ancestor that has
  one) and then hands over to fz_xml_find_dfs_top with the same criteria.

  Returns NULL if item is NULL, or if the climb reaches top, a NULL parent,
  or the document object before finding a following node.
*/
fz_xml *fz_xml_find_next_dfs_top(fz_xml *item, const char *tag, const char *att, const char *match, fz_xml *top)
{
  /* Skip over any DOC object. */
  if (item && FZ_DOCUMENT_ITEM(item))
    item = item->down;

  if (item == NULL)
    return NULL;

  /*
    Only descend into non-text items, matching the guard that
    fz_xml_find_dfs_top applies to the same step; a text item's 'down'
    field is not followed as a child link there either.
  */
  if (!FZ_TEXT_ITEM(item) && item->down)
    item = item->down;
  else if (item->u.node.next)
    item = item->u.node.next;
  else
    while (1) {
      item = item->up;
      /* Stop searching if we hit our declared 'top' item. */
      if (item == top)
        return NULL;
      /* We should never reach item == NULL, but just in case. */
      if (item == NULL)
        return NULL;
      /* If we reach the DOC object at the top, we're done. */
      if (item->up == NULL)
        return NULL;
      if (item->u.node.next)
      {
        item = item->u.node.next;
        break;
      }
    }

  return fz_xml_find_dfs_top(item, tag, att, match, top);
}
1384
1385
/*
  Take a new reference to the document that xml belongs to.

  The reference count lives on the topmost node (the one with no parent), so
  the function walks up to it before calling fz_keep_imp. The node that was
  passed in is returned, not the topmost node. A NULL xml is returned as is.
*/
fz_xml *fz_keep_xml(fz_context *ctx, fz_xml *xml)
{
  fz_xml *top;

  if (xml == NULL)
    return xml;

  for (top = xml; top->up; top = top->up)
    ;

  fz_keep_imp(ctx, top, &top->u.doc.refs);

  /* Return the original node pointer, not the dom pointer! */
  return xml;
}