/src/tidy-html5/src/lexer.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* lexer.c -- Lexer for html parser |
2 | | |
3 | | (c) 1998-2008 (W3C) MIT, ERCIM, Keio University |
4 | | See tidy.h for the copyright notice. |
5 | | |
6 | | */ |
7 | | |
8 | | /* |
9 | | Given a file stream fp it returns a sequence of tokens. |
10 | | |
11 | | GetToken(fp) gets the next token |
12 | | UngetToken(fp) provides one level undo |
13 | | |
14 | | The tags include an attribute list: |
15 | | |
16 | | - linked list of attribute/value nodes |
17 | | - each node has 2 NULL-terminated strings. |
18 | | - entities are replaced in attribute values |
19 | | |
20 | | white space is compacted if not in preformatted mode |
21 | | If not in preformatted mode then leading white space |
22 | | is discarded and subsequent white space sequences |
23 | | compacted to single space characters. |
24 | | |
25 | | If XmlTags is no then Tag names are folded to upper |
26 | | case and attribute names to lower case. |
27 | | |
28 | | Not yet done: |
29 | | - Doctype subset and marked sections |
30 | | */ |
31 | | |
32 | | #include "tidy-int.h" |
33 | | #include "lexer.h" |
34 | | #include "parser.h" |
35 | | #include "entities.h" |
36 | | #include "streamio.h" |
37 | | #include "message.h" |
38 | | #include "tmbstr.h" |
39 | | #include "clean.h" |
40 | | #include "utf8.h" |
41 | | #include "streamio.h" |
42 | | #include "sprtf.h" |
43 | | |
44 | | #if defined(ENABLE_DEBUG_LOG) |
45 | | /* #define DEBUG_ALLOCATION special EXTRA allocation debug information - VERY NOISY */ |
46 | | static void check_me(char *name); |
47 | | static Bool show_attrs = yes; |
48 | | #define MX_TXT 8 |
49 | | static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */ |
50 | | static tmbstr get_text_string(Lexer* lexer, Node *node) |
51 | | { |
52 | | uint len = node->end - node->start; |
53 | | tmbstr cp = lexer->lexbuf + node->start; |
54 | | tmbstr end = lexer->lexbuf + node->end; |
55 | | unsigned char c; |
56 | | uint i = 0; |
57 | | Bool insp = no; |
58 | | if (len <= ((MX_TXT * 2) + 3)) { |
59 | | buffer[0] = 0; |
60 | | while (cp < end) { |
61 | | c = *cp; |
62 | | cp++; |
63 | | if (c == '\n') { |
64 | | buffer[i++] = '\\'; |
65 | | buffer[i++] = 'n'; |
66 | | } else if (c == '\t') { |
67 | | buffer[i++] = '\\'; |
68 | | buffer[i++] = 't'; |
69 | | } else if ( c == ' ' ) { |
70 | | if (!insp) |
71 | | buffer[i++] = c; |
72 | | insp = yes; |
73 | | } else { |
74 | | buffer[i++] = c; |
75 | | insp = no; |
76 | | } |
77 | | } |
78 | | } else { |
79 | | char *end1 = cp + MX_TXT; |
80 | | char *bgn = cp + (len - MX_TXT); |
81 | | buffer[0] = 0; |
82 | | if (bgn < end1) |
83 | | bgn = end1; |
84 | | while (cp < end1) { |
85 | | c = *cp; |
86 | | cp++; |
87 | | if (c == '\n') { |
88 | | buffer[i++] = '\\'; |
89 | | buffer[i++] = 'n'; |
90 | | } else if (c == '\t') { |
91 | | buffer[i++] = '\\'; |
92 | | buffer[i++] = 't'; |
93 | | } else if ( c == ' ' ) { |
94 | | if (!insp) |
95 | | buffer[i++] = c; |
96 | | insp = yes; |
97 | | } else { |
98 | | buffer[i++] = c; |
99 | | insp = no; |
100 | | } |
101 | | if (i >= MX_TXT) |
102 | | break; |
103 | | } |
104 | | c = '.'; |
105 | | if ((i < len)&&(cp < bgn)) { |
106 | | buffer[i++] = c; |
107 | | cp++; |
108 | | if ((i < len)&&(cp < bgn)) { |
109 | | buffer[i++] = c; |
110 | | cp++; |
111 | | if ((i < len)&&(cp < bgn)) { |
112 | | buffer[i++] = c; |
113 | | cp++; |
114 | | } |
115 | | } |
116 | | } |
117 | | cp = bgn; |
118 | | insp = no; |
119 | | while (cp < end) { |
120 | | c = *cp; |
121 | | cp++; |
122 | | if (c == '\n') { |
123 | | buffer[i++] = '\\'; |
124 | | buffer[i++] = 'n'; |
125 | | } else if (c == '\t') { |
126 | | buffer[i++] = '\\'; |
127 | | buffer[i++] = 't'; |
128 | | } else if ( c == ' ' ) { |
129 | | if (!insp) |
130 | | buffer[i++] = c; |
131 | | insp = yes; |
132 | | } else { |
133 | | buffer[i++] = c; |
134 | | insp = no; |
135 | | } |
136 | | } |
137 | | } |
138 | | buffer[i] = 0; |
139 | | return buffer; |
140 | | } |
141 | | static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node ) |
142 | | { |
143 | | Lexer* lexer = doc->lexer; |
144 | | Bool lex = ((msg[0] == 'l')&&(msg[1] == 'e')) ? yes : no; |
145 | | int line = ( doc->lexer ? doc->lexer->lines : 0 ); |
146 | | int col = ( doc->lexer ? doc->lexer->columns : 0 ); |
147 | | tmbstr src = lex ? "lexer" : "stream"; |
148 | | SPRTF("R=%d C=%d: ", line, col ); |
149 | | /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */ |
150 | | if ((line == 3) && (col == 1)) { |
151 | | check_me("Show_Node"); /* just a debug trap */ |
152 | | } |
153 | | if (lexer && lexer->token && |
154 | | ((lexer->token->type == TextNode)||(node && (node->type == TextNode)))) { |
155 | | if (show_attrs) { |
156 | | uint len = node ? node->end - node->start : 0; |
157 | | tmbstr cp = node ? get_text_string( lexer, node ) : "NULL"; |
158 | | SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len, src ); |
159 | | } else { |
160 | | SPRTF("Returning %s TextNode %p... %s\n", msg, node, src ); |
161 | | } |
162 | | } else { |
163 | | tmbstr name = node ? node->element ? node->element : "blank" : "NULL"; |
164 | | if (show_attrs) { |
165 | | AttVal* av; |
166 | | SPRTF("Returning %s node <%s", msg, name); |
167 | | if (node) { |
168 | | for (av = node->attributes; av; av = av->next) { |
169 | | name = av->attribute; |
170 | | if (name) { |
171 | | SPRTF(" %s",name); |
172 | | if (av->value) { |
173 | | SPRTF("=\"%s\"", av->value); |
174 | | } |
175 | | } |
176 | | } |
177 | | } |
178 | | SPRTF("> %s\n", src); |
179 | | } else { |
180 | | SPRTF("Returning %s node %p <%s>... %s\n", msg, node, |
181 | | name, src ); |
182 | | } |
183 | | } |
184 | | } |
185 | | #define GTDBG(a,b,c) Show_Node(a,b,c) |
186 | | #else /* ENABLE_DEBUG_LOG */ |
187 | | #define GTDBG(a,b,c) |
188 | | #endif /* defined(ENABLE_DEBUG_LOG) */ |
189 | | |
190 | | /* Forward references |
191 | | */ |
192 | | /* swallows closing '>' */ |
193 | | static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty ); |
194 | | |
195 | | static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, |
196 | | Node **asp, Node **php ); |
197 | | |
198 | | static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase, |
199 | | Bool *isempty, int *pdelim ); |
200 | | |
201 | | static Node *ParseDocTypeDecl(TidyDocImpl* doc); |
202 | | |
203 | | static void AddAttrToList( AttVal** list, AttVal* av ); |
204 | | |
205 | | /* used to classify characters for lexical purposes */ |
206 | 404M | #define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0) |
207 | | static uint lexmap[128]; |
208 | | |
209 | 0 | #define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name) |
210 | 16.0k | #define IsValidXMLElemName(name) TY_(IsValidXMLID)(name) |
211 | | |
212 | | static struct _doctypes |
213 | | { |
214 | | uint score; |
215 | | uint vers; |
216 | | uint vers_out; |
217 | | Bool xhtml; |
218 | | ctmbstr name; |
219 | | ctmbstr fpi; |
220 | | ctmbstr si; |
221 | | } const W3C_Doctypes[] = |
222 | | { |
223 | | { 2, HT20, 200, no, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, }, |
224 | | { 2, HT20, 200, no, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, }, |
225 | | { 2, HT20, 200, no, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, }, |
226 | | { 1, HT32, 320, no, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, }, |
227 | | { 1, HT32, 320, no, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, }, |
228 | | { 1, HT32, 320, no, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, }, |
229 | | { 6, H40S, 400, no, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" }, |
230 | | { 8, H40T, 400, no, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" }, |
231 | | { 7, H40F, 400, no, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" }, |
232 | | { 3, H41S, 401, no, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" }, |
233 | | { 5, H41T, 401, no, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" }, |
234 | | { 4, H41F, 401, no, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" }, |
235 | | { 9, X10S, 100, yes, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" }, |
236 | | { 11, X10T, 100, yes, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" }, |
237 | | { 10, X10F, 100, yes, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" }, |
238 | | { 12, XH11, 110, yes, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" }, |
239 | | { 13, XB10, 100, yes, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" }, |
240 | | |
241 | | { 20, HT50, 500, no, "HTML5", NULL, NULL }, |
242 | | { 21, XH50, 500, yes, "XHTML5", NULL, NULL }, |
243 | | |
244 | | /* final entry */ |
245 | | { 0, 0, 0, no, NULL, NULL, NULL } |
246 | | }; |
247 | | |
248 | | /* |
249 | | * Issue #643 - Since VERS_FROM40 was extended to include VERS_HTML5 |
250 | | * to be used in the expanded entity table some 155 times, |
251 | | * need a special macro here to denote just HTML 4 plus XHTML, |
252 | | * which is actually the former define of VERS_FROM40 |
253 | | */ |
254 | 3.15M | #define VERS_HMTL40PX (VERS_HTML40|VERS_XHTML11|VERS_BASIC) |
255 | | |
256 | | int TY_(HTMLVersion)(TidyDocImpl* doc) |
257 | 3.02M | { |
258 | 3.02M | uint i; |
259 | 3.02M | uint j = 0; |
260 | 3.02M | uint score = 0; |
261 | 3.02M | uint vers = doc->lexer->versions; |
262 | 3.02M | uint dtver = doc->lexer->doctype; |
263 | 3.02M | TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); |
264 | 3.02M | Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) && |
265 | 3.02M | !cfgBool(doc, TidyHtmlOut); |
266 | 3.02M | Bool html4 = ((dtmode == TidyDoctypeStrict) || (dtmode == TidyDoctypeLoose) || |
267 | 3.02M | (VERS_HMTL40PX & dtver) ? yes : no); |
268 | 3.02M | Bool html5 = (!html4 && ((dtmode == TidyDoctypeAuto) || |
269 | 3.00M | (dtmode == TidyDoctypeHtml5)) ? yes : no); |
270 | | |
271 | 3.02M | if (xhtml && dtver == VERS_UNKNOWN) return XH50; |
272 | 27.6k | if (dtver == VERS_UNKNOWN) return HT50; |
273 | | /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */ |
274 | 27.6k | if (!xhtml && (dtver == VERS_HTML5)) return HT50; |
275 | | /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that, |
276 | | and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */ |
277 | 27.6k | if (xhtml && html5 && ((vers & VERS_HTML5) == XH50)) return XH50; |
278 | | |
279 | 537k | for (i = 0; W3C_Doctypes[i].name; ++i) |
280 | 510k | { |
281 | 510k | if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) || |
282 | 510k | (html4 && !(VERS_HMTL40PX & W3C_Doctypes[i].vers))) |
283 | 370k | continue; |
284 | | |
285 | 139k | if (vers & W3C_Doctypes[i].vers && |
286 | 139k | (W3C_Doctypes[i].score < score || !score)) |
287 | 19.9k | { |
288 | 19.9k | score = W3C_Doctypes[i].score; |
289 | 19.9k | j = i; |
290 | 19.9k | } |
291 | 139k | } |
292 | | |
293 | 26.8k | if (score) |
294 | 18.4k | return W3C_Doctypes[j].vers; |
295 | | |
296 | 8.37k | return VERS_UNKNOWN; |
297 | 26.8k | } |
298 | | |
299 | | static ctmbstr GetFPIFromVers(uint vers) |
300 | 2.20k | { |
301 | 2.20k | uint i; |
302 | | |
303 | 43.5k | for (i = 0; W3C_Doctypes[i].name; ++i) |
304 | 41.4k | if (W3C_Doctypes[i].vers == vers) |
305 | 80 | return W3C_Doctypes[i].fpi; |
306 | | |
307 | 2.12k | return NULL; |
308 | 2.20k | } |
309 | | |
310 | | static ctmbstr GetSIFromVers(uint vers) |
311 | 0 | { |
312 | 0 | uint i; |
313 | |
|
314 | 0 | for (i = 0; W3C_Doctypes[i].name; ++i) |
315 | 0 | if (W3C_Doctypes[i].vers == vers) |
316 | 0 | return W3C_Doctypes[i].si; |
317 | | |
318 | 0 | return NULL; |
319 | 0 | } |
320 | | |
321 | | static ctmbstr GetNameFromVers(uint vers) |
322 | 0 | { |
323 | 0 | uint i; |
324 | |
|
325 | 0 | for (i = 0; W3C_Doctypes[i].name; ++i) |
326 | 0 | if (W3C_Doctypes[i].vers == vers) |
327 | 0 | return W3C_Doctypes[i].name; |
328 | | |
329 | 0 | return NULL; |
330 | 0 | } |
331 | | |
332 | | static uint GetVersFromFPI(ctmbstr fpi) |
333 | 2.20k | { |
334 | 2.20k | uint i; |
335 | | |
336 | 43.5k | for (i = 0; W3C_Doctypes[i].name; ++i) |
337 | 41.4k | if (W3C_Doctypes[i].fpi != NULL && TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0) |
338 | 80 | return W3C_Doctypes[i].vers; |
339 | | |
340 | 2.12k | return 0; |
341 | 2.20k | } |
342 | | |
343 | | #ifdef ENABLE_DEBUG_LOG |
344 | | # ifndef EndBuf |
345 | | # define EndBuf(a) ( a + strlen(a) ) |
346 | | # endif |
347 | | |
348 | | /* Issue #377 - Output diminishing version bits */ |
349 | | typedef struct tagV2S { |
350 | | uint bit; |
351 | | ctmbstr val; |
352 | | }V2S, *PV2S; |
353 | | |
354 | | static V2S v2s[] = { |
355 | | { HT20, "HT20" }, |
356 | | { HT32, "HT32" }, |
357 | | { H40S, "H40S" }, |
358 | | { H40T, "H40T" }, |
359 | | { H40F, "H40F" }, |
360 | | { H41S, "H41S" }, |
361 | | { H41T, "H41T" }, |
362 | | { H41F, "H41F" }, |
363 | | { X10S, "X10S" }, |
364 | | { X10T, "X10T" }, |
365 | | { X10F, "X10F" }, |
366 | | { XH11, "XH11" }, |
367 | | { XB10, "XB10" }, /* 4096u */ |
368 | | /* { VERS_SUN, "VSUN" }, */ |
369 | | /* { VERS_NETSCAPE, "VNET" }, */ |
370 | | /* { VERS_MICROSOFT, "VMIC" }, 32768u */ |
371 | | { VERS_XML, "VXML" }, /* 65536u */ |
372 | | /* HTML5 */ |
373 | | { HT50, "HT50" }, /* 131072u */ |
374 | | { XH50, "XH50" }, /* 262144u */ |
375 | | { 0, 0 } |
376 | | }; |
377 | | |
378 | | /* Process the above table, adding a bit name, |
379 | | or '----' when not present */ |
380 | | static char *add_vers_string( tmbstr buf, uint vers ) |
381 | | { |
382 | | PV2S pv2s = v2s; |
383 | | int len = (int)strlen(buf); |
384 | | while (pv2s->val) { |
385 | | if (vers & pv2s->bit) { |
386 | | if (len) { |
387 | | strcat(buf,"|"); |
388 | | len++; |
389 | | } |
390 | | strcat(buf,pv2s->val); |
391 | | len += (int)strlen(pv2s->val); |
392 | | vers &= ~(pv2s->bit); |
393 | | if (!vers) |
394 | | break; |
395 | | } else { |
396 | | if (len) { |
397 | | strcat(buf,"|"); |
398 | | len++; |
399 | | } |
400 | | strcat(buf,"----"); |
401 | | len += 4; |
402 | | |
403 | | } |
404 | | pv2s++; |
405 | | } |
406 | | if (vers) { /* Should not have any here! */ |
407 | | if (len) |
408 | | strcat(buf,"|"); |
409 | | sprintf(EndBuf(buf),"%u",vers); |
410 | | } |
411 | | return buf; |
412 | | |
413 | | } |
414 | | |
415 | | /* Issue #377 - Show first Before: list, and then on any change |
416 | | Note the VERS_PROPRIETARY are exclude since they always remain */ |
417 | | void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers) |
418 | | { |
419 | | static char vcur[256]; |
420 | | static Bool dnfirst = no; |
421 | | uint curr = doc->lexer->versions; /* get current */ |
422 | | doc->lexer->versions &= (vers | VERS_PROPRIETARY); |
423 | | if (curr != doc->lexer->versions) { /* only if different */ |
424 | | if (!dnfirst) { |
425 | | dnfirst = yes; |
426 | | vcur[0] = 0; |
427 | | curr &= ~(VERS_PROPRIETARY); |
428 | | add_vers_string( vcur, curr ); |
429 | | SPRTF("Before: %s\n", vcur); |
430 | | } |
431 | | vcur[0] = 0; |
432 | | curr = doc->lexer->versions; |
433 | | curr &= ~(VERS_PROPRIETARY); |
434 | | add_vers_string( vcur, curr ); |
435 | | SPRTF("After : %s\n", vcur); |
436 | | } |
437 | | } |
438 | | #else /* !#if defined(ENABLE_DEBUG_LOG) */ |
439 | | /* everything is allowed in proprietary version of HTML */ |
440 | | /* this is handled here rather than in the tag/attr dicts */ |
441 | | void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers) |
442 | 1.14M | { |
443 | 1.14M | doc->lexer->versions &= (vers | VERS_PROPRIETARY); |
444 | 1.14M | } |
445 | | #endif /* #if defined(ENABLE_DEBUG_LOG) y/n */ |
446 | | |
447 | | Bool TY_(IsWhite)(uint c) |
448 | 208M | { |
449 | 208M | uint map = MAP(c); |
450 | | |
451 | 208M | return (map & white)!=0; |
452 | 208M | } |
453 | | |
454 | | Bool TY_(IsNewline)(uint c) |
455 | 0 | { |
456 | 0 | uint map = MAP(c); |
457 | 0 | return (map & newline)!=0; |
458 | 0 | } |
459 | | |
460 | | Bool TY_(IsDigit)(uint c) |
461 | 53.1k | { |
462 | 53.1k | uint map; |
463 | | |
464 | 53.1k | map = MAP(c); |
465 | | |
466 | 53.1k | return (map & digit)!=0; |
467 | 53.1k | } |
468 | | |
469 | | static Bool IsDigitHex(uint c) |
470 | 2.52M | { |
471 | 2.52M | uint map; |
472 | | |
473 | 2.52M | map = MAP(c); |
474 | | |
475 | 2.52M | return (map & digithex)!=0; |
476 | 2.52M | } |
477 | | |
478 | | Bool TY_(IsLetter)(uint c) |
479 | 1.81M | { |
480 | 1.81M | uint map; |
481 | | |
482 | 1.81M | map = MAP(c); |
483 | | |
484 | 1.81M | return (map & letter)!=0; |
485 | 1.81M | } |
486 | | |
487 | | Bool TY_(IsHTMLSpace)(uint c) |
488 | 1.12M | { |
489 | 1.12M | return c == 0x020 || c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d; |
490 | 1.12M | } |
491 | | |
492 | | Bool TY_(IsNamechar)(uint c) |
493 | 10.7M | { |
494 | 10.7M | uint map = MAP(c); |
495 | 10.7M | return (map & namechar)!=0; |
496 | 10.7M | } |
497 | | |
498 | | Bool TY_(IsXMLLetter)(uint c) |
499 | 387k | { |
500 | 387k | return ((c >= 0x41 && c <= 0x5a) || |
501 | 387k | (c >= 0x61 && c <= 0x7a) || |
502 | 387k | (c >= 0xc0 && c <= 0xd6) || |
503 | 387k | (c >= 0xd8 && c <= 0xf6) || |
504 | 387k | (c >= 0xf8 && c <= 0xff) || |
505 | 387k | (c >= 0x100 && c <= 0x131) || |
506 | 387k | (c >= 0x134 && c <= 0x13e) || |
507 | 387k | (c >= 0x141 && c <= 0x148) || |
508 | 387k | (c >= 0x14a && c <= 0x17e) || |
509 | 387k | (c >= 0x180 && c <= 0x1c3) || |
510 | 387k | (c >= 0x1cd && c <= 0x1f0) || |
511 | 387k | (c >= 0x1f4 && c <= 0x1f5) || |
512 | 387k | (c >= 0x1fa && c <= 0x217) || |
513 | 387k | (c >= 0x250 && c <= 0x2a8) || |
514 | 387k | (c >= 0x2bb && c <= 0x2c1) || |
515 | 387k | c == 0x386 || |
516 | 387k | (c >= 0x388 && c <= 0x38a) || |
517 | 387k | c == 0x38c || |
518 | 387k | (c >= 0x38e && c <= 0x3a1) || |
519 | 387k | (c >= 0x3a3 && c <= 0x3ce) || |
520 | 387k | (c >= 0x3d0 && c <= 0x3d6) || |
521 | 387k | c == 0x3da || |
522 | 387k | c == 0x3dc || |
523 | 387k | c == 0x3de || |
524 | 387k | c == 0x3e0 || |
525 | 387k | (c >= 0x3e2 && c <= 0x3f3) || |
526 | 387k | (c >= 0x401 && c <= 0x40c) || |
527 | 387k | (c >= 0x40e && c <= 0x44f) || |
528 | 387k | (c >= 0x451 && c <= 0x45c) || |
529 | 387k | (c >= 0x45e && c <= 0x481) || |
530 | 387k | (c >= 0x490 && c <= 0x4c4) || |
531 | 387k | (c >= 0x4c7 && c <= 0x4c8) || |
532 | 387k | (c >= 0x4cb && c <= 0x4cc) || |
533 | 387k | (c >= 0x4d0 && c <= 0x4eb) || |
534 | 387k | (c >= 0x4ee && c <= 0x4f5) || |
535 | 387k | (c >= 0x4f8 && c <= 0x4f9) || |
536 | 387k | (c >= 0x531 && c <= 0x556) || |
537 | 387k | c == 0x559 || |
538 | 387k | (c >= 0x561 && c <= 0x586) || |
539 | 387k | (c >= 0x5d0 && c <= 0x5ea) || |
540 | 387k | (c >= 0x5f0 && c <= 0x5f2) || |
541 | 387k | (c >= 0x621 && c <= 0x63a) || |
542 | 387k | (c >= 0x641 && c <= 0x64a) || |
543 | 387k | (c >= 0x671 && c <= 0x6b7) || |
544 | 387k | (c >= 0x6ba && c <= 0x6be) || |
545 | 387k | (c >= 0x6c0 && c <= 0x6ce) || |
546 | 387k | (c >= 0x6d0 && c <= 0x6d3) || |
547 | 387k | c == 0x6d5 || |
548 | 387k | (c >= 0x6e5 && c <= 0x6e6) || |
549 | 387k | (c >= 0x905 && c <= 0x939) || |
550 | 387k | c == 0x93d || |
551 | 387k | (c >= 0x958 && c <= 0x961) || |
552 | 387k | (c >= 0x985 && c <= 0x98c) || |
553 | 387k | (c >= 0x98f && c <= 0x990) || |
554 | 387k | (c >= 0x993 && c <= 0x9a8) || |
555 | 387k | (c >= 0x9aa && c <= 0x9b0) || |
556 | 387k | c == 0x9b2 || |
557 | 387k | (c >= 0x9b6 && c <= 0x9b9) || |
558 | 387k | (c >= 0x9dc && c <= 0x9dd) || |
559 | 387k | (c >= 0x9df && c <= 0x9e1) || |
560 | 387k | (c >= 0x9f0 && c <= 0x9f1) || |
561 | 387k | (c >= 0xa05 && c <= 0xa0a) || |
562 | 387k | (c >= 0xa0f && c <= 0xa10) || |
563 | 387k | (c >= 0xa13 && c <= 0xa28) || |
564 | 387k | (c >= 0xa2a && c <= 0xa30) || |
565 | 387k | (c >= 0xa32 && c <= 0xa33) || |
566 | 387k | (c >= 0xa35 && c <= 0xa36) || |
567 | 387k | (c >= 0xa38 && c <= 0xa39) || |
568 | 387k | (c >= 0xa59 && c <= 0xa5c) || |
569 | 387k | c == 0xa5e || |
570 | 387k | (c >= 0xa72 && c <= 0xa74) || |
571 | 387k | (c >= 0xa85 && c <= 0xa8b) || |
572 | 387k | c == 0xa8d || |
573 | 387k | (c >= 0xa8f && c <= 0xa91) || |
574 | 387k | (c >= 0xa93 && c <= 0xaa8) || |
575 | 387k | (c >= 0xaaa && c <= 0xab0) || |
576 | 387k | (c >= 0xab2 && c <= 0xab3) || |
577 | 387k | (c >= 0xab5 && c <= 0xab9) || |
578 | 387k | c == 0xabd || |
579 | 387k | c == 0xae0 || |
580 | 387k | (c >= 0xb05 && c <= 0xb0c) || |
581 | 387k | (c >= 0xb0f && c <= 0xb10) || |
582 | 387k | (c >= 0xb13 && c <= 0xb28) || |
583 | 387k | (c >= 0xb2a && c <= 0xb30) || |
584 | 387k | (c >= 0xb32 && c <= 0xb33) || |
585 | 387k | (c >= 0xb36 && c <= 0xb39) || |
586 | 387k | c == 0xb3d || |
587 | 387k | (c >= 0xb5c && c <= 0xb5d) || |
588 | 387k | (c >= 0xb5f && c <= 0xb61) || |
589 | 387k | (c >= 0xb85 && c <= 0xb8a) || |
590 | 387k | (c >= 0xb8e && c <= 0xb90) || |
591 | 387k | (c >= 0xb92 && c <= 0xb95) || |
592 | 387k | (c >= 0xb99 && c <= 0xb9a) || |
593 | 387k | c == 0xb9c || |
594 | 387k | (c >= 0xb9e && c <= 0xb9f) || |
595 | 387k | (c >= 0xba3 && c <= 0xba4) || |
596 | 387k | (c >= 0xba8 && c <= 0xbaa) || |
597 | 387k | (c >= 0xbae && c <= 0xbb5) || |
598 | 387k | (c >= 0xbb7 && c <= 0xbb9) || |
599 | 387k | (c >= 0xc05 && c <= 0xc0c) || |
600 | 387k | (c >= 0xc0e && c <= 0xc10) || |
601 | 387k | (c >= 0xc12 && c <= 0xc28) || |
602 | 387k | (c >= 0xc2a && c <= 0xc33) || |
603 | 387k | (c >= 0xc35 && c <= 0xc39) || |
604 | 387k | (c >= 0xc60 && c <= 0xc61) || |
605 | 387k | (c >= 0xc85 && c <= 0xc8c) || |
606 | 387k | (c >= 0xc8e && c <= 0xc90) || |
607 | 387k | (c >= 0xc92 && c <= 0xca8) || |
608 | 387k | (c >= 0xcaa && c <= 0xcb3) || |
609 | 387k | (c >= 0xcb5 && c <= 0xcb9) || |
610 | 387k | c == 0xcde || |
611 | 387k | (c >= 0xce0 && c <= 0xce1) || |
612 | 387k | (c >= 0xd05 && c <= 0xd0c) || |
613 | 387k | (c >= 0xd0e && c <= 0xd10) || |
614 | 387k | (c >= 0xd12 && c <= 0xd28) || |
615 | 387k | (c >= 0xd2a && c <= 0xd39) || |
616 | 387k | (c >= 0xd60 && c <= 0xd61) || |
617 | 387k | (c >= 0xe01 && c <= 0xe2e) || |
618 | 387k | c == 0xe30 || |
619 | 387k | (c >= 0xe32 && c <= 0xe33) || |
620 | 387k | (c >= 0xe40 && c <= 0xe45) || |
621 | 387k | (c >= 0xe81 && c <= 0xe82) || |
622 | 387k | c == 0xe84 || |
623 | 387k | (c >= 0xe87 && c <= 0xe88) || |
624 | 387k | c == 0xe8a || |
625 | 387k | c == 0xe8d || |
626 | 387k | (c >= 0xe94 && c <= 0xe97) || |
627 | 387k | (c >= 0xe99 && c <= 0xe9f) || |
628 | 387k | (c >= 0xea1 && c <= 0xea3) || |
629 | 387k | c == 0xea5 || |
630 | 387k | c == 0xea7 || |
631 | 387k | (c >= 0xeaa && c <= 0xeab) || |
632 | 387k | (c >= 0xead && c <= 0xeae) || |
633 | 387k | c == 0xeb0 || |
634 | 387k | (c >= 0xeb2 && c <= 0xeb3) || |
635 | 387k | c == 0xebd || |
636 | 387k | (c >= 0xec0 && c <= 0xec4) || |
637 | 387k | (c >= 0xf40 && c <= 0xf47) || |
638 | 387k | (c >= 0xf49 && c <= 0xf69) || |
639 | 387k | (c >= 0x10a0 && c <= 0x10c5) || |
640 | 387k | (c >= 0x10d0 && c <= 0x10f6) || |
641 | 387k | c == 0x1100 || |
642 | 387k | (c >= 0x1102 && c <= 0x1103) || |
643 | 387k | (c >= 0x1105 && c <= 0x1107) || |
644 | 387k | c == 0x1109 || |
645 | 387k | (c >= 0x110b && c <= 0x110c) || |
646 | 387k | (c >= 0x110e && c <= 0x1112) || |
647 | 387k | c == 0x113c || |
648 | 387k | c == 0x113e || |
649 | 387k | c == 0x1140 || |
650 | 387k | c == 0x114c || |
651 | 387k | c == 0x114e || |
652 | 387k | c == 0x1150 || |
653 | 387k | (c >= 0x1154 && c <= 0x1155) || |
654 | 387k | c == 0x1159 || |
655 | 387k | (c >= 0x115f && c <= 0x1161) || |
656 | 387k | c == 0x1163 || |
657 | 387k | c == 0x1165 || |
658 | 387k | c == 0x1167 || |
659 | 387k | c == 0x1169 || |
660 | 387k | (c >= 0x116d && c <= 0x116e) || |
661 | 387k | (c >= 0x1172 && c <= 0x1173) || |
662 | 387k | c == 0x1175 || |
663 | 387k | c == 0x119e || |
664 | 387k | c == 0x11a8 || |
665 | 387k | c == 0x11ab || |
666 | 387k | (c >= 0x11ae && c <= 0x11af) || |
667 | 387k | (c >= 0x11b7 && c <= 0x11b8) || |
668 | 387k | c == 0x11ba || |
669 | 387k | (c >= 0x11bc && c <= 0x11c2) || |
670 | 387k | c == 0x11eb || |
671 | 387k | c == 0x11f0 || |
672 | 387k | c == 0x11f9 || |
673 | 387k | (c >= 0x1e00 && c <= 0x1e9b) || |
674 | 387k | (c >= 0x1ea0 && c <= 0x1ef9) || |
675 | 387k | (c >= 0x1f00 && c <= 0x1f15) || |
676 | 387k | (c >= 0x1f18 && c <= 0x1f1d) || |
677 | 387k | (c >= 0x1f20 && c <= 0x1f45) || |
678 | 387k | (c >= 0x1f48 && c <= 0x1f4d) || |
679 | 387k | (c >= 0x1f50 && c <= 0x1f57) || |
680 | 387k | c == 0x1f59 || |
681 | 387k | c == 0x1f5b || |
682 | 387k | c == 0x1f5d || |
683 | 387k | (c >= 0x1f5f && c <= 0x1f7d) || |
684 | 387k | (c >= 0x1f80 && c <= 0x1fb4) || |
685 | 387k | (c >= 0x1fb6 && c <= 0x1fbc) || |
686 | 387k | c == 0x1fbe || |
687 | 387k | (c >= 0x1fc2 && c <= 0x1fc4) || |
688 | 387k | (c >= 0x1fc6 && c <= 0x1fcc) || |
689 | 387k | (c >= 0x1fd0 && c <= 0x1fd3) || |
690 | 387k | (c >= 0x1fd6 && c <= 0x1fdb) || |
691 | 387k | (c >= 0x1fe0 && c <= 0x1fec) || |
692 | 387k | (c >= 0x1ff2 && c <= 0x1ff4) || |
693 | 387k | (c >= 0x1ff6 && c <= 0x1ffc) || |
694 | 387k | c == 0x2126 || |
695 | 387k | (c >= 0x212a && c <= 0x212b) || |
696 | 387k | c == 0x212e || |
697 | 387k | (c >= 0x2180 && c <= 0x2182) || |
698 | 387k | (c >= 0x3041 && c <= 0x3094) || |
699 | 387k | (c >= 0x30a1 && c <= 0x30fa) || |
700 | 387k | (c >= 0x3105 && c <= 0x312c) || |
701 | 387k | (c >= 0xac00 && c <= 0xd7a3) || |
702 | 387k | (c >= 0x4e00 && c <= 0x9fa5) || |
703 | 387k | c == 0x3007 || |
704 | 387k | (c >= 0x3021 && c <= 0x3029) || |
705 | 387k | (c >= 0x4e00 && c <= 0x9fa5) || |
706 | 387k | c == 0x3007 || |
707 | 387k | (c >= 0x3021 && c <= 0x3029)); |
708 | 387k | } |
709 | | |
710 | | Bool TY_(IsXMLNamechar)(uint c) |
711 | 331k | { |
712 | 331k | return (TY_(IsXMLLetter)(c) || |
713 | 331k | c == '.' || c == '_' || |
714 | 331k | c == ':' || c == '-' || |
715 | 331k | (c >= 0x300 && c <= 0x345) || |
716 | 331k | (c >= 0x360 && c <= 0x361) || |
717 | 331k | (c >= 0x483 && c <= 0x486) || |
718 | 331k | (c >= 0x591 && c <= 0x5a1) || |
719 | 331k | (c >= 0x5a3 && c <= 0x5b9) || |
720 | 331k | (c >= 0x5bb && c <= 0x5bd) || |
721 | 331k | c == 0x5bf || |
722 | 331k | (c >= 0x5c1 && c <= 0x5c2) || |
723 | 331k | c == 0x5c4 || |
724 | 331k | (c >= 0x64b && c <= 0x652) || |
725 | 331k | c == 0x670 || |
726 | 331k | (c >= 0x6d6 && c <= 0x6dc) || |
727 | 331k | (c >= 0x6dd && c <= 0x6df) || |
728 | 331k | (c >= 0x6e0 && c <= 0x6e4) || |
729 | 331k | (c >= 0x6e7 && c <= 0x6e8) || |
730 | 331k | (c >= 0x6ea && c <= 0x6ed) || |
731 | 331k | (c >= 0x901 && c <= 0x903) || |
732 | 331k | c == 0x93c || |
733 | 331k | (c >= 0x93e && c <= 0x94c) || |
734 | 331k | c == 0x94d || |
735 | 331k | (c >= 0x951 && c <= 0x954) || |
736 | 331k | (c >= 0x962 && c <= 0x963) || |
737 | 331k | (c >= 0x981 && c <= 0x983) || |
738 | 331k | c == 0x9bc || |
739 | 331k | c == 0x9be || |
740 | 331k | c == 0x9bf || |
741 | 331k | (c >= 0x9c0 && c <= 0x9c4) || |
742 | 331k | (c >= 0x9c7 && c <= 0x9c8) || |
743 | 331k | (c >= 0x9cb && c <= 0x9cd) || |
744 | 331k | c == 0x9d7 || |
745 | 331k | (c >= 0x9e2 && c <= 0x9e3) || |
746 | 331k | c == 0xa02 || |
747 | 331k | c == 0xa3c || |
748 | 331k | c == 0xa3e || |
749 | 331k | c == 0xa3f || |
750 | 331k | (c >= 0xa40 && c <= 0xa42) || |
751 | 331k | (c >= 0xa47 && c <= 0xa48) || |
752 | 331k | (c >= 0xa4b && c <= 0xa4d) || |
753 | 331k | (c >= 0xa70 && c <= 0xa71) || |
754 | 331k | (c >= 0xa81 && c <= 0xa83) || |
755 | 331k | c == 0xabc || |
756 | 331k | (c >= 0xabe && c <= 0xac5) || |
757 | 331k | (c >= 0xac7 && c <= 0xac9) || |
758 | 331k | (c >= 0xacb && c <= 0xacd) || |
759 | 331k | (c >= 0xb01 && c <= 0xb03) || |
760 | 331k | c == 0xb3c || |
761 | 331k | (c >= 0xb3e && c <= 0xb43) || |
762 | 331k | (c >= 0xb47 && c <= 0xb48) || |
763 | 331k | (c >= 0xb4b && c <= 0xb4d) || |
764 | 331k | (c >= 0xb56 && c <= 0xb57) || |
765 | 331k | (c >= 0xb82 && c <= 0xb83) || |
766 | 331k | (c >= 0xbbe && c <= 0xbc2) || |
767 | 331k | (c >= 0xbc6 && c <= 0xbc8) || |
768 | 331k | (c >= 0xbca && c <= 0xbcd) || |
769 | 331k | c == 0xbd7 || |
770 | 331k | (c >= 0xc01 && c <= 0xc03) || |
771 | 331k | (c >= 0xc3e && c <= 0xc44) || |
772 | 331k | (c >= 0xc46 && c <= 0xc48) || |
773 | 331k | (c >= 0xc4a && c <= 0xc4d) || |
774 | 331k | (c >= 0xc55 && c <= 0xc56) || |
775 | 331k | (c >= 0xc82 && c <= 0xc83) || |
776 | 331k | (c >= 0xcbe && c <= 0xcc4) || |
777 | 331k | (c >= 0xcc6 && c <= 0xcc8) || |
778 | 331k | (c >= 0xcca && c <= 0xccd) || |
779 | 331k | (c >= 0xcd5 && c <= 0xcd6) || |
780 | 331k | (c >= 0xd02 && c <= 0xd03) || |
781 | 331k | (c >= 0xd3e && c <= 0xd43) || |
782 | 331k | (c >= 0xd46 && c <= 0xd48) || |
783 | 331k | (c >= 0xd4a && c <= 0xd4d) || |
784 | 331k | c == 0xd57 || |
785 | 331k | c == 0xe31 || |
786 | 331k | (c >= 0xe34 && c <= 0xe3a) || |
787 | 331k | (c >= 0xe47 && c <= 0xe4e) || |
788 | 331k | c == 0xeb1 || |
789 | 331k | (c >= 0xeb4 && c <= 0xeb9) || |
790 | 331k | (c >= 0xebb && c <= 0xebc) || |
791 | 331k | (c >= 0xec8 && c <= 0xecd) || |
792 | 331k | (c >= 0xf18 && c <= 0xf19) || |
793 | 331k | c == 0xf35 || |
794 | 331k | c == 0xf37 || |
795 | 331k | c == 0xf39 || |
796 | 331k | c == 0xf3e || |
797 | 331k | c == 0xf3f || |
798 | 331k | (c >= 0xf71 && c <= 0xf84) || |
799 | 331k | (c >= 0xf86 && c <= 0xf8b) || |
800 | 331k | (c >= 0xf90 && c <= 0xf95) || |
801 | 331k | c == 0xf97 || |
802 | 331k | (c >= 0xf99 && c <= 0xfad) || |
803 | 331k | (c >= 0xfb1 && c <= 0xfb7) || |
804 | 331k | c == 0xfb9 || |
805 | 331k | (c >= 0x20d0 && c <= 0x20dc) || |
806 | 331k | c == 0x20e1 || |
807 | 331k | (c >= 0x302a && c <= 0x302f) || |
808 | 331k | c == 0x3099 || |
809 | 331k | c == 0x309a || |
810 | 331k | (c >= 0x30 && c <= 0x39) || |
811 | 331k | (c >= 0x660 && c <= 0x669) || |
812 | 331k | (c >= 0x6f0 && c <= 0x6f9) || |
813 | 331k | (c >= 0x966 && c <= 0x96f) || |
814 | 331k | (c >= 0x9e6 && c <= 0x9ef) || |
815 | 331k | (c >= 0xa66 && c <= 0xa6f) || |
816 | 331k | (c >= 0xae6 && c <= 0xaef) || |
817 | 331k | (c >= 0xb66 && c <= 0xb6f) || |
818 | 331k | (c >= 0xbe7 && c <= 0xbef) || |
819 | 331k | (c >= 0xc66 && c <= 0xc6f) || |
820 | 331k | (c >= 0xce6 && c <= 0xcef) || |
821 | 331k | (c >= 0xd66 && c <= 0xd6f) || |
822 | 331k | (c >= 0xe50 && c <= 0xe59) || |
823 | 331k | (c >= 0xed0 && c <= 0xed9) || |
824 | 331k | (c >= 0xf20 && c <= 0xf29) || |
825 | 331k | c == 0xb7 || |
826 | 331k | c == 0x2d0 || |
827 | 331k | c == 0x2d1 || |
828 | 331k | c == 0x387 || |
829 | 331k | c == 0x640 || |
830 | 331k | c == 0xe46 || |
831 | 331k | c == 0xec6 || |
832 | 331k | c == 0x3005 || |
833 | 331k | (c >= 0x3031 && c <= 0x3035) || |
834 | 331k | (c >= 0x309d && c <= 0x309e) || |
835 | 331k | (c >= 0x30fc && c <= 0x30fe)); |
836 | 331k | } |
837 | | |
838 | | Bool TY_(IsUpper)(uint c) |
839 | 11.1M | { |
840 | 11.1M | uint map = MAP(c); |
841 | | |
842 | 11.1M | return (map & uppercase)!=0; |
843 | 11.1M | } |
844 | | |
845 | | uint TY_(ToLower)(uint c) |
846 | 169M | { |
847 | 169M | uint map = MAP(c); |
848 | | |
849 | 169M | if (map & uppercase) |
850 | 8.94M | c += 'a' - 'A'; |
851 | | |
852 | 169M | return c; |
853 | 169M | } |
854 | | |
855 | | uint TY_(ToUpper)(uint c) |
856 | 186k | { |
857 | 186k | uint map = MAP(c); |
858 | | |
859 | 186k | if (map & lowercase) |
860 | 11.6k | c += (uint) ('A' - 'a' ); |
861 | | |
862 | 186k | return c; |
863 | 186k | } |
864 | | |
865 | | /* |
866 | | return last character in string |
867 | | this is useful when trailing quotemark |
868 | | is missing on an attribute |
869 | | */ |
870 | | static tmbchar LastChar( tmbstr str ) |
871 | 153k | { |
872 | 153k | if ( str && *str ) |
873 | 153k | { |
874 | 153k | int n = TY_(tmbstrlen)(str); |
875 | 153k | return str[n-1]; |
876 | 153k | } |
877 | 686 | return 0; |
878 | 153k | } |
879 | | |
880 | | Lexer* TY_(NewLexer)( TidyDocImpl* doc ) |
881 | 17.0k | { |
882 | 17.0k | Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) ); |
883 | | |
884 | 17.0k | if ( lexer != NULL ) |
885 | 17.0k | { |
886 | 17.0k | TidyClearMemory( lexer, sizeof(Lexer) ); |
887 | | |
888 | 17.0k | lexer->allocator = doc->allocator; |
889 | 17.0k | lexer->lines = 1; |
890 | 17.0k | lexer->columns = 1; |
891 | 17.0k | lexer->state = LEX_CONTENT; |
892 | | |
893 | 17.0k | lexer->versions = (VERS_ALL|VERS_PROPRIETARY); |
894 | 17.0k | lexer->doctype = VERS_UNKNOWN; |
895 | 17.0k | lexer->root = &doc->root; |
896 | 17.0k | } |
897 | 17.0k | return lexer; |
898 | 17.0k | } |
899 | | |
900 | | static Bool EndOfInput( TidyDocImpl* doc ) |
901 | 683k | { |
902 | 683k | assert( doc->docIn != NULL ); |
903 | 683k | return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) ); |
904 | 683k | } |
905 | | |
906 | | void TY_(FreeLexer)( TidyDocImpl* doc ) |
907 | 34.1k | { |
908 | 34.1k | Lexer *lexer = doc->lexer; |
909 | 34.1k | if ( lexer ) |
910 | 17.0k | { |
911 | 17.0k | TY_(FreeStyles)( doc ); |
912 | | |
913 | | /* See GetToken() */ |
914 | 17.0k | if ( lexer->pushed || lexer->itoken ) |
915 | 0 | { |
916 | 0 | if (lexer->pushed) |
917 | 0 | TY_(FreeNode)( doc, lexer->itoken ); |
918 | 0 | TY_(FreeNode)( doc, lexer->token ); |
919 | 0 | } |
920 | | |
921 | 49.6k | while ( lexer->istacksize > 0 ) |
922 | 32.6k | TY_(PopInline)( doc, NULL ); |
923 | | |
924 | 17.0k | TidyDocFree( doc, lexer->istack ); |
925 | 17.0k | TidyDocFree( doc, lexer->lexbuf ); |
926 | 17.0k | TidyDocFree( doc, lexer ); |
927 | 17.0k | doc->lexer = NULL; |
928 | 17.0k | } |
929 | 34.1k | } |
930 | | |
931 | | /* Lexer uses bigger memory chunks than pprint as |
932 | | ** it must hold the entire input document. not just |
933 | | ** the last line or three. |
934 | | */ |
935 | | static void AddByte( Lexer *lexer, tmbchar ch ) |
936 | 278M | { |
937 | 278M | if ( lexer->lexsize + 2 >= lexer->lexlength ) |
938 | 18.4k | { |
939 | 18.4k | tmbstr buf = NULL; |
940 | 18.4k | uint allocAmt = lexer->lexlength; |
941 | 18.4k | uint prev = allocAmt; /* Is. #761 */ |
942 | 36.8k | while ( lexer->lexsize + 2 >= allocAmt ) |
943 | 18.4k | { |
944 | 18.4k | if ( allocAmt == 0 ) |
945 | 17.0k | allocAmt = 8192; |
946 | 1.40k | else |
947 | 1.40k | allocAmt *= 2; |
948 | 18.4k | if (allocAmt < prev) /* Is. #761 - watch for wrap - and */ |
949 | 0 | TidyPanic(lexer->allocator, "\nPanic: out of internal memory!\nDocument input too big!\n"); |
950 | 18.4k | } |
951 | 18.4k | buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt ); |
952 | 18.4k | if ( buf ) |
953 | 18.4k | { |
954 | 18.4k | TidyClearMemory( buf + lexer->lexlength, |
955 | 18.4k | allocAmt - lexer->lexlength ); |
956 | 18.4k | lexer->lexbuf = buf; |
957 | 18.4k | lexer->lexlength = allocAmt; |
958 | 18.4k | } |
959 | 18.4k | } |
960 | | |
961 | 278M | lexer->lexbuf[ lexer->lexsize++ ] = ch; |
962 | 278M | lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */ |
963 | 278M | } |
964 | | |
965 | | static void ChangeChar( Lexer *lexer, tmbchar c ) |
966 | 1.07M | { |
967 | 1.07M | if ( lexer->lexsize > 0 ) |
968 | 1.07M | { |
969 | 1.07M | lexer->lexbuf[ lexer->lexsize-1 ] = c; |
970 | 1.07M | } |
971 | 1.07M | } |
972 | | |
973 | | /* store character c as UTF-8 encoded byte stream */ |
974 | | void TY_(AddCharToLexer)( Lexer *lexer, uint c ) |
975 | 274M | { |
976 | 274M | int i, err, count = 0; |
977 | 274M | tmbchar buf[10] = {0}; |
978 | | |
979 | 274M | err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); |
980 | 274M | if (err) |
981 | 5.50k | { |
982 | | /* replacement character 0xFFFD encoded as UTF-8 */ |
983 | 5.50k | buf[0] = (byte) 0xEF; |
984 | 5.50k | buf[1] = (byte) 0xBF; |
985 | 5.50k | buf[2] = (byte) 0xBD; |
986 | 5.50k | count = 3; |
987 | 5.50k | } |
988 | | |
989 | 552M | for ( i = 0; i < count; ++i ) |
990 | 278M | AddByte( lexer, buf[i] ); |
991 | 274M | } |
992 | | |
993 | | static void AddStringToLexer( Lexer *lexer, ctmbstr str ) |
994 | 0 | { |
995 | 0 | uint c; |
996 | | |
997 | | /* Many (all?) compilers will sign-extend signed chars (the default) when |
998 | | ** converting them to unsigned integer values. We must cast our char to |
999 | | ** unsigned char before assigning it to prevent this from happening. |
1000 | | */ |
1001 | 0 | while( 0 != (c = (unsigned char) *str++ )) |
1002 | 0 | TY_(AddCharToLexer)( lexer, c ); |
1003 | 0 | } |
1004 | | |
1005 | | |
1006 | | static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer ) |
1007 | 4.86M | { |
1008 | 4.86M | lexer->lines = doc->docIn->curline; |
1009 | 4.86M | lexer->columns = doc->docIn->curcol; |
1010 | 4.86M | } |
1011 | | |
1012 | | /* |
1013 | | Issue #483 |
1014 | | Have detected the first of a surrogate pair... |
1015 | | Try to find, decode the second... |
1016 | | Already have '&' start... |
1017 | | */ |
1018 | | |
1019 | | typedef enum { |
1020 | | SP_ok, |
1021 | | SP_failed, |
1022 | | SP_error |
1023 | | }SPStatus; |
1024 | | |
1025 | | static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch) |
1026 | 3.75k | { |
1027 | 3.75k | Lexer* lexer = doc->lexer; |
1028 | 3.75k | uint bufSize = 32; |
1029 | 3.75k | uint c, ch = 0, offset = 0; |
1030 | 3.75k | tmbstr buf = 0; |
1031 | 3.75k | SPStatus status = SP_error; /* assume failed */ |
1032 | 3.75k | int type = 0; /* assume numeric */ |
1033 | 3.75k | uint fch = *pch; |
1034 | 3.75k | int i; /* has to be signed due to for i >= 0 */ |
1035 | 3.75k | if (!lexer) |
1036 | 0 | return status; |
1037 | 3.75k | buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize); |
1038 | 3.75k | if (!buf) |
1039 | 0 | return status; |
1040 | 64.6k | while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream ) |
1041 | 64.5k | { |
1042 | 64.5k | if (c == ';') |
1043 | 979 | { |
1044 | 979 | break; /* reached end of entity */ |
1045 | 979 | } |
1046 | 63.5k | if ((offset + 2) > bufSize) |
1047 | 510 | { |
1048 | 510 | bufSize *= 2; |
1049 | 510 | buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize); |
1050 | 510 | if (!buf) |
1051 | 0 | { |
1052 | 0 | break; |
1053 | 0 | } |
1054 | 510 | } |
1055 | 63.5k | buf[offset++] = c; /* add char to buffer */ |
1056 | 63.5k | if (offset == 1) |
1057 | 3.45k | { |
1058 | 3.45k | if (c != '#') /* is a numeric entity */ |
1059 | 511 | break; |
1060 | 3.45k | } |
1061 | 60.1k | else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X'))) |
1062 | 2.23k | { |
1063 | 2.23k | type = 1; /* set hex digits */ |
1064 | 2.23k | } |
1065 | 57.8k | else |
1066 | 57.8k | { |
1067 | 57.8k | if (type) /* if hex digits */ |
1068 | 53.1k | { |
1069 | 53.1k | if (!IsDigitHex(c)) |
1070 | 1.49k | break; |
1071 | 53.1k | } |
1072 | 4.68k | else /* if numeric */ |
1073 | 4.68k | { |
1074 | 4.68k | if (!TY_(IsDigit)(c)) |
1075 | 692 | break; |
1076 | 4.68k | } |
1077 | 57.8k | } |
1078 | 63.5k | } |
1079 | | |
1080 | 3.75k | if (c == ';') |
1081 | 979 | { |
1082 | 979 | int scanned; |
1083 | | |
1084 | 979 | buf[offset] = 0; |
1085 | 979 | if (type) |
1086 | 687 | scanned = sscanf(buf + 2, "%x", &ch); |
1087 | 292 | else |
1088 | 292 | scanned = sscanf(buf + 1, "%d", &ch); |
1089 | | |
1090 | 979 | if (scanned == 1 && TY_(IsHighSurrogate)(ch)) |
1091 | 442 | { |
1092 | 442 | ch = TY_(CombineSurrogatePair)(ch, fch); |
1093 | 442 | if (TY_(IsValidCombinedChar)(ch)) |
1094 | 234 | { |
1095 | 234 | *pch = ch; /* return combined pair value */ |
1096 | 234 | status = SP_ok; /* full success - pair used */ |
1097 | 234 | } |
1098 | 208 | else |
1099 | 208 | { |
1100 | 208 | status = SP_failed; /* is one of the 32 out-of-range pairs */ |
1101 | 208 | *pch = 0xFFFD; /* return substitute character */ |
1102 | 208 | TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, ch); /* SP WARNING: - */ |
1103 | 208 | } |
1104 | 442 | } |
1105 | 979 | } |
1106 | | |
1107 | 3.75k | if (status == SP_error) |
1108 | 3.31k | { |
1109 | | /* Error condition - can only put back all the chars */ |
1110 | 3.31k | if (c == ';') /* if last, not added to buffer */ |
1111 | 537 | TY_(UngetChar)(c, doc->docIn); |
1112 | 3.31k | if (buf && offset) |
1113 | 3.01k | { |
1114 | | /* correct the order for unget - last first */ |
1115 | 63.9k | for (i = offset - 1; i >= 0; i--) |
1116 | 60.9k | { |
1117 | 60.9k | c = buf[i]; |
1118 | 60.9k | TY_(UngetChar)(c, doc->docIn); |
1119 | 60.9k | } |
1120 | 3.01k | } |
1121 | 3.31k | } |
1122 | | |
1123 | 3.75k | if (buf) |
1124 | 3.75k | TidyFree(lexer->allocator, buf); |
1125 | | |
1126 | 3.75k | return status; |
1127 | 3.75k | } |
1128 | | |
/*
  No longer attempts to insert missing ';' for unknown
  entities unless one was present already, since this
  gives unexpected results.

  For example:   <a href="something.htm?foo&bar&fred">
  was tidied to: <a href="something.htm?foo&bar;&fred;">
  rather than:   <a href="something.htm?foo&bar&fred">

  My thanks for Maurice Buxton for spotting this.

  Also Randy Waki pointed out the following case for the
  04 Aug 00 version (bug #433012):

  For example:   <a href="something.htm?id=1&lang=en">
  was tidied to: <a href="something.htm?id=1&lang;=en">
  rather than:   <a href="something.htm?id=1&lang=en">

  where "lang" is a known entity (#9001), but browsers would
  misinterpret "&lang;" because it had a value > 256.

  So the case of an apparently known entity with a value > 256 and
  missing a semicolon is handled specially.

  "ParseEntity" is also a bit of a misnomer - it handles entities and
  numeric character references. Invalid NCR's are now reported.

  On entry the '&' is the last byte in the lexer buffer.  The
  characters of the reference are appended after it while scanning;
  afterwards the buffer is either left holding the raw text (unknown
  or preserved references) or rewound to the '&' and the decoded
  character stored in its place.

  doc  - document being lexed; supplies the input stream, the lexer
         and the configuration options consulted below
  mode - OtherNamespace forces a ';'-terminated reference to be kept
         verbatim; Preformatted turns character 160 into a space
*/
static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
{
    /* which flavour of reference is being scanned */
    typedef enum
    {
        ENT_default,
        ENT_numdec,
        ENT_numhex
    } ENTState;

    /* character test for each ENTState, indexed by the state value */
    typedef Bool (*ENTfn)(uint);
    const ENTfn entFn[] = {
        TY_(IsNamechar),
        TY_(IsDigit),
        IsDigitHex
    };
    uint start;
    ENTState entState = ENT_default;
    uint charRead = 0;
    Bool semicolon = no, found = no;
    Bool isXml = cfgBool( doc, TidyXmlTags );
    Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
    uint c, ch, startcol, entver = 0;
    Lexer* lexer = doc->lexer;

    start = lexer->lexsize - 1;  /* to start at "&" */
    startcol = doc->docIn->curcol - 1;

    /* Copy the characters of the reference into the lexer buffer.
       The loop ends at ';' (consumed, not stored), at end of input,
       or at the first character the current state rejects (that
       character is pushed back). */
    while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
    {
        if ( c == ';' )
        {
            semicolon = yes;
            break;
        }
        ++charRead;

        if (charRead == 1 && c == '#')
        {
            /* numeric references are left alone when NCR handling is
               off or the input encoding is BIG5 / SHIFTJIS */
            if ( !cfgBool(doc, TidyNCR) ||
                 cfg(doc, TidyInCharEncoding) == BIG5 ||
                 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
            {
                TY_(UngetChar)('#', doc->docIn);
                return;
            }

            TY_(AddCharToLexer)( lexer, c );
            entState = ENT_numdec;
            continue;
        }
        else if (charRead == 2 && entState == ENT_numdec
                 && (c == 'x' || (!isXml && c == 'X')) )
        {
            /* "#x" switches to hex; an upper-case 'X' is only
               accepted outside XML mode */
            TY_(AddCharToLexer)( lexer, c );
            entState = ENT_numhex;
            continue;
        }

        if ( entFn[entState](c) )
        {
            TY_(AddCharToLexer)( lexer, c );
            continue;
        }

        /* otherwise put it back */
        TY_(UngetChar)( c, doc->docIn );
        break;
    }

    /* make sure entity is NULL terminated */
    lexer->lexbuf[lexer->lexsize] = '\0';

    /* Should constrain version to XML/XHTML if &apos;
    ** is encountered.  But this is not possible with
    ** Tidy's content model bit mask.
    */
    if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0
         && !cfgBool(doc, TidyXmlOut)
         && !lexer->isvoyager
         && !cfgBool(doc, TidyXhtmlOut)
         && !(TY_(HTMLVersion)(doc) == HT50) )  /* Issue #239 - no warning if in HTML5++ mode */
        TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );

    if (( mode == OtherNamespace ) && ( c == ';' ))
    {
        /* #130 MathML attr and entity fix! */
        /* treat the reference as known and keep its text verbatim */
        found = yes;
        ch = 255;
        entver = XH50|HT50;
        preserveEntities = yes;
    }
    else
    {
        /* Lookup entity code and version
        */
        /* NOTE(review): ch is read below even when found is no, so
           this presumably relies on EntityInfo always storing a
           value through &ch - confirm in entities.c */
        found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
    }

    /* Issue #483 - Deal with 'surrogate pairs' */
    /* TODO: Maybe warning/error, like found a leading surrogate
       but no following surrogate! Maybe should avoid outputting
       invalid utf-8 for this entity - maybe substitute? */
    if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
    {
        uint c1;
        if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
        {
            SPStatus status;
            /* Have a following entity,
               so there is a chance of having a valid surrogate pair */
            c1 = ch;  /* keep first value, in case of error */
            status = GetSurrogatePair(doc, isXml, &ch);
            if (status == SP_error)
            {
                /* note: ch is NOT replaced by 0xFFFD on this path,
                   unlike the two branches below */
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, c1, 0);  /* SP WARNING: - using substitute character */
                TY_(UngetChar)('&', doc->docIn);  /* otherwise put it back */
            }
        }
        else
        {
            /* put this non-entity lead char back */
            TY_(UngetChar)(c1, doc->docIn);
            /* Have leading surrogate pair, with no tail */
            TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, ch, 0);  /* SP WARNING: - using substitute character */
            ch = 0xFFFD;
        }
    }
    else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
    {
        /* Have trailing surrogate pair, with no lead */
        TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, ch, 0);  /* SP WARNING: - using substitute character */
        ch = 0xFFFD;
    }

    /* deal with unrecognized or invalid entities */
    /* #433012 - fix by Randy Waki 17 Feb 01 */
    /* report invalid NCR's - Terry Teague 01 Sep 01 */
    if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
    {
        /* set error position just before offending character */
        SetLexerLocus( doc, lexer );
        lexer->columns = startcol;

        /* true when at least one character followed the '&' */
        if (lexer->lexsize > start + 1)
        {
            if (ch >= 128 && ch <= 159)
            {
                /* invalid numeric character reference */

                uint c1 = 0;
                int replaceMode = DISCARDED_CHAR;

                /* Always assume Win1252 in this circumstance. */
                c1 = TY_(DecodeWin1252)( ch );

                if ( c1 )
                    replaceMode = REPLACED_CHAR;

                if ( c != ';' )  /* issue warning if not terminated by ';' */
                    TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
                                            lexer->lexbuf+start, c );

                TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);

                if ( c1 )
                {
                    /* make the replacement */
                    lexer->lexsize = start;
                    TY_(AddCharToLexer)( lexer, c1 );
                    semicolon = no;
                }
                else
                {
                    /* discard */
                    lexer->lexsize = start;
                    semicolon = no;
                }

            }
            else
                TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
                                        lexer->lexbuf+start, ch );

            /* an unknown reference keeps the ';' it came with */
            if (semicolon)
                TY_(AddCharToLexer)( lexer, ';' );
        }
        else
        {
            /*\
             *  Issue #207 - A naked & is allowed in HTML5, as an unambiguous ampersand!
            \*/
            if (TY_(HTMLVersion)(doc) != HT50)
            {
                TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
                                        lexer->lexbuf+start, ch );
            }
        }
    }
    else
    {
        if ( c != ';' )  /* issue warning if not terminated by ';' */
        {
            /* set error position just before offending character */
            SetLexerLocus( doc, lexer );
            lexer->columns = startcol;
            TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
        }

        if (preserveEntities)
            TY_(AddCharToLexer)( lexer, ';' );
        else
        {
            /* rewind to the '&' and store the decoded character */
            lexer->lexsize = start;
            if ( ch == 160 && (mode == Preformatted) )
                ch = ' ';
            TY_(AddCharToLexer)( lexer, ch );

            if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
                AddStringToLexer( lexer, "amp;" );
        }

        /* Detect extended vs. basic entities */
        TY_(ConstrainVersion)( doc, entver );
    }
}
1381 | | |
1382 | | static tmbchar ParseTagName( TidyDocImpl* doc ) |
1383 | 1.11M | { |
1384 | 1.11M | Lexer *lexer = doc->lexer; |
1385 | 1.11M | uint c = lexer->lexbuf[ lexer->txtstart ]; |
1386 | 1.11M | Bool xml = cfgBool(doc, TidyXmlTags); |
1387 | | |
1388 | | /* fold case of first character in buffer */ |
1389 | 1.11M | if (!xml && TY_(IsUpper)(c)) |
1390 | 562k | lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c); |
1391 | | |
1392 | 10.0M | while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) |
1393 | 10.0M | { |
1394 | 10.0M | if ((!xml && !TY_(IsNamechar)(c)) || |
1395 | 10.0M | (xml && !TY_(IsXMLNamechar)(c))) |
1396 | 1.10M | break; |
1397 | | |
1398 | | /* fold case of subsequent characters */ |
1399 | 8.95M | if (!xml && TY_(IsUpper)(c)) |
1400 | 5.45M | c = TY_(ToLower)(c); |
1401 | | |
1402 | 8.95M | TY_(AddCharToLexer)(lexer, c); |
1403 | 8.95M | } |
1404 | | |
1405 | 1.11M | lexer->txtend = lexer->lexsize; |
1406 | 1.11M | return (tmbchar) c; |
1407 | 1.11M | } |
1408 | | |
1409 | | /* |
1410 | | Used for elements and text nodes |
1411 | | element name is NULL for text nodes |
1412 | | start and end are offsets into lexbuf |
1413 | | which contains the textual content of |
1414 | | all elements in the parse tree. |
1415 | | |
1416 | | parent and content allow traversal |
1417 | | of the parse tree in any direction. |
1418 | | attributes are represented as a linked |
1419 | | list of AttVal nodes which hold the |
1420 | | strings for attribute/value pairs. |
1421 | | */ |
1422 | | |
1423 | | |
1424 | | Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer) |
1425 | 3.33M | { |
1426 | 3.33M | Node* node = (Node*) TidyAlloc( allocator, sizeof(Node) ); |
1427 | 3.33M | TidyClearMemory( node, sizeof(Node) ); |
1428 | 3.33M | if ( lexer ) |
1429 | 3.33M | { |
1430 | 3.33M | node->line = lexer->lines; |
1431 | 3.33M | node->column = lexer->columns; |
1432 | 3.33M | } |
1433 | 3.33M | node->type = TextNode; |
1434 | | #if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION) |
1435 | | SPRTF("Allocated node %p\n", node ); |
1436 | | #endif |
1437 | 3.33M | return node; |
1438 | 3.33M | } |
1439 | | |
1440 | | /* used to clone heading nodes when split by an <HR> */ |
1441 | | Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element ) |
1442 | 3.50k | { |
1443 | 3.50k | Lexer* lexer = doc->lexer; |
1444 | 3.50k | Node *node = TY_(NewNode)( lexer->allocator, lexer ); |
1445 | | |
1446 | 3.50k | node->start = lexer->lexsize; |
1447 | 3.50k | node->end = lexer->lexsize; |
1448 | | |
1449 | 3.50k | if ( element ) |
1450 | 3.50k | { |
1451 | 3.50k | node->parent = element->parent; |
1452 | 3.50k | node->type = element->type; |
1453 | 3.50k | node->closed = element->closed; |
1454 | 3.50k | node->implicit = element->implicit; |
1455 | 3.50k | node->tag = element->tag; |
1456 | 3.50k | node->element = TY_(tmbstrdup)( doc->allocator, element->element ); |
1457 | 3.50k | node->attributes = TY_(DupAttrs)( doc, element->attributes ); |
1458 | 3.50k | } |
1459 | 3.50k | return node; |
1460 | 3.50k | } |
1461 | | |
1462 | | /* free node's attributes */ |
1463 | | void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ) |
1464 | 3.21M | { |
1465 | 3.52M | while ( node->attributes ) |
1466 | 319k | { |
1467 | 319k | AttVal *av = node->attributes; |
1468 | | |
1469 | 319k | if ( av->attribute ) |
1470 | 313k | { |
1471 | 313k | if ( (attrIsID(av) || attrIsNAME(av)) && |
1472 | 313k | TY_(IsAnchorElement)(doc, node) ) |
1473 | 7.91k | { |
1474 | 7.91k | TY_(RemoveAnchorByNode)( doc, av->value, node ); |
1475 | 7.91k | } |
1476 | 313k | } |
1477 | | |
1478 | 319k | node->attributes = av->next; |
1479 | 319k | TY_(FreeAttribute)( doc, av ); |
1480 | 319k | } |
1481 | 3.21M | } |
1482 | | |
1483 | | /* doesn't repair attribute list linkage */ |
1484 | | void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ) |
1485 | 491k | { |
1486 | 491k | TY_(FreeNode)( doc, av->asp ); |
1487 | 491k | TY_(FreeNode)( doc, av->php ); |
1488 | 491k | TidyDocFree( doc, av->attribute ); |
1489 | 491k | TidyDocFree( doc, av->value ); |
1490 | 491k | TidyDocFree( doc, av ); |
1491 | 491k | } |
1492 | | |
1493 | | /* detach attribute from node |
1494 | | */ |
1495 | | void TY_(DetachAttribute)( Node *node, AttVal *attr ) |
1496 | 8.86k | { |
1497 | 8.86k | AttVal *av, *prev = NULL; |
1498 | | |
1499 | 18.2k | for ( av = node->attributes; av; av = av->next ) |
1500 | 18.2k | { |
1501 | 18.2k | if ( av == attr ) |
1502 | 8.86k | { |
1503 | 8.86k | if ( prev ) |
1504 | 4.56k | prev->next = attr->next; |
1505 | 4.29k | else |
1506 | 4.29k | node->attributes = attr->next; |
1507 | 8.86k | break; |
1508 | 8.86k | } |
1509 | 9.43k | prev = av; |
1510 | 9.43k | } |
1511 | 8.86k | } |
1512 | | |
1513 | | /* detach attribute from node then free it |
1514 | | */ |
1515 | | void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ) |
1516 | 8.86k | { |
1517 | 8.86k | TY_(DetachAttribute)( node, attr ); |
1518 | 8.86k | TY_(FreeAttribute)( doc, attr ); |
1519 | 8.86k | } |
1520 | | |
1521 | | /* |
1522 | | Free document nodes by iterating through peers and recursing |
1523 | | through children. Set next to NULL before calling TY_(FreeNode)() |
1524 | | to avoid freeing peer nodes. Doesn't patch up prev/next links. |
1525 | | */ |
1526 | | void TY_(FreeNode)( TidyDocImpl* doc, Node *node ) |
1527 | 5.38M | { |
1528 | | #if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION) |
1529 | | /* avoid showing free of root node! */ |
1530 | | if (node) { |
1531 | | if (RootNode != node->type) { |
1532 | | SPRTF("Free node %p\n", node); |
1533 | | } |
1534 | | else { |
1535 | | SPRTF("Root node %p\n", node); |
1536 | | } |
1537 | | } |
1538 | | #endif |
1539 | | |
1540 | 8.59M | while ( node ) |
1541 | 3.20M | { |
1542 | 3.20M | Node* next = node->next; |
1543 | | |
1544 | 3.20M | TY_(FreeAttrs)( doc, node ); |
1545 | 3.20M | TY_(FreeNode)( doc, node->content ); |
1546 | 3.20M | TidyDocFree( doc, node->element ); |
1547 | 3.20M | if (RootNode != node->type) |
1548 | 3.20M | TidyDocFree( doc, node ); |
1549 | 34.1k | else |
1550 | 34.1k | node->content = NULL; |
1551 | | |
1552 | 3.20M | node = next; |
1553 | 3.20M | } |
1554 | 5.38M | } |
1555 | | |
1556 | | Node* TY_(TextToken)( Lexer *lexer ) |
1557 | 187k | { |
1558 | 187k | Node *node = TY_(NewNode)( lexer->allocator, lexer ); |
1559 | 187k | node->start = lexer->txtstart; |
1560 | 187k | node->end = lexer->txtend; |
1561 | 187k | return node; |
1562 | 187k | } |
1563 | | |
1564 | | /* used for creating preformatted text from Word2000 */ |
1565 | | Node *TY_(NewLineNode)( Lexer *lexer ) |
1566 | 0 | { |
1567 | 0 | Node *node = TY_(NewNode)( lexer->allocator, lexer ); |
1568 | 0 | node->start = lexer->lexsize; |
1569 | 0 | TY_(AddCharToLexer)( lexer, (uint)'\n' ); |
1570 | 0 | node->end = lexer->lexsize; |
1571 | 0 | return node; |
1572 | 0 | } |
1573 | | |
1574 | | /* used for adding a for Word2000 */ |
1575 | | Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt ) |
1576 | 0 | { |
1577 | 0 | Node *node = TY_(NewNode)( lexer->allocator, lexer ); |
1578 | 0 | node->start = lexer->lexsize; |
1579 | 0 | AddStringToLexer( lexer, txt ); |
1580 | 0 | node->end = lexer->lexsize; |
1581 | 0 | return node; |
1582 | 0 | } |
1583 | | |
1584 | | static Node* TagToken( TidyDocImpl* doc, NodeType type ) |
1585 | 1.11M | { |
1586 | 1.11M | Lexer* lexer = doc->lexer; |
1587 | 1.11M | Node* node = TY_(NewNode)( lexer->allocator, lexer ); |
1588 | 1.11M | node->type = type; |
1589 | 1.11M | node->element = TY_(tmbstrndup)( doc->allocator, |
1590 | 1.11M | lexer->lexbuf + lexer->txtstart, |
1591 | 1.11M | lexer->txtend - lexer->txtstart ); |
1592 | 1.11M | node->start = lexer->txtstart; |
1593 | 1.11M | node->end = lexer->txtstart; |
1594 | | |
1595 | 1.11M | if ( type == StartTag || type == StartEndTag || type == EndTag ) |
1596 | 1.11M | TY_(FindTag)(doc, node); |
1597 | | |
1598 | 1.11M | return node; |
1599 | 1.11M | } |
1600 | | |
1601 | | static Node* NewToken(TidyDocImpl* doc, NodeType type) |
1602 | 36.8k | { |
1603 | 36.8k | Lexer* lexer = doc->lexer; |
1604 | 36.8k | Node* node = TY_(NewNode)(lexer->allocator, lexer); |
1605 | 36.8k | node->type = type; |
1606 | 36.8k | node->start = lexer->txtstart; |
1607 | 36.8k | node->end = lexer->txtend; |
1608 | 36.8k | return node; |
1609 | 36.8k | } |
1610 | | |
1611 | 4.67k | #define CommentToken(doc) NewToken(doc, CommentTag) |
1612 | | #define DocTypeToken(doc) NewToken(doc, DocTypeTag) |
1613 | 15.9k | #define PIToken(doc) NewToken(doc, ProcInsTag) |
1614 | 2.96k | #define AspToken(doc) NewToken(doc, AspTag) |
1615 | 837 | #define JsteToken(doc) NewToken(doc, JsteTag) |
1616 | 3.95k | #define PhpToken(doc) NewToken(doc, PhpTag) |
1617 | 4.88k | #define XmlDeclToken(doc) NewToken(doc, XmlDecl) |
1618 | 2.56k | #define SectionToken(doc) NewToken(doc, SectionTag) |
1619 | 1.07k | #define CDATAToken(doc) NewToken(doc, CDATATag) |
1620 | | |
1621 | | void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ) |
1622 | 0 | { |
1623 | 0 | byte c; |
1624 | 0 | while(0 != (c = *str++) ) { |
1625 | | /*\ |
1626 | | * Issue #286 |
1627 | | * Previously this used TY_(AddCharToLexer)( lexer, c ); |
1628 | | * which uses err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count ); |
1629 | | * But this is transferring already 'translated' data from an |
1630 | | * internal location to the lexer, so should use AddByte() |
1631 | | \*/ |
1632 | 0 | AddByte( lexer, c ); |
1633 | 0 | } |
1634 | 0 | } |
1635 | | |
1636 | | /* |
1637 | | void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ) |
1638 | | { |
1639 | | byte c; |
1640 | | int ix; |
1641 | | |
1642 | | for ( ix=0; ix < len && (c = *str++); ++ix ) |
1643 | | TY_(AddCharToLexer)(lexer, c); |
1644 | | } |
1645 | | */ |
1646 | | |
1647 | | /* find doctype element */ |
1648 | | Node *TY_(FindDocType)( TidyDocImpl* doc ) |
1649 | 18.0k | { |
1650 | 18.0k | Node* node; |
1651 | 18.0k | for ( node = (doc ? doc->root.content : NULL); |
1652 | 20.0k | node && node->type != DocTypeTag; |
1653 | 18.0k | node = node->next ) |
1654 | 2.02k | /**/; |
1655 | 18.0k | return node; |
1656 | 18.0k | } |
1657 | | |
1658 | | /* find parent container element */ |
1659 | | Node* TY_(FindContainer)( Node* node ) |
1660 | 0 | { |
1661 | 0 | for ( node = (node ? node->parent : NULL); |
1662 | 0 | node && TY_(nodeHasCM)(node, CM_INLINE); |
1663 | 0 | node = node->parent ) |
1664 | 0 | /**/; |
1665 | |
|
1666 | 0 | return node; |
1667 | 0 | } |
1668 | | |
1669 | | |
1670 | | /* find html element */ |
1671 | | Node *TY_(FindHTML)( TidyDocImpl* doc ) |
1672 | 58.3k | { |
1673 | 58.3k | Node *node; |
1674 | 58.3k | for ( node = (doc ? doc->root.content : NULL); |
1675 | 109k | node && !nodeIsHTML(node); |
1676 | 58.3k | node = node->next ) |
1677 | 51.4k | /**/; |
1678 | | |
1679 | 58.3k | return node; |
1680 | 58.3k | } |
1681 | | |
1682 | | /* find XML Declaration */ |
1683 | | Node *TY_(FindXmlDecl)(TidyDocImpl* doc) |
1684 | 1.14k | { |
1685 | 1.14k | Node *node; |
1686 | 1.14k | for ( node = (doc ? doc->root.content : NULL); |
1687 | 1.41k | node && !(node->type == XmlDecl); |
1688 | 1.14k | node = node->next ) |
1689 | 269 | /**/; |
1690 | | |
1691 | 1.14k | return node; |
1692 | 1.14k | } |
1693 | | |
1694 | | |
1695 | | Node *TY_(FindHEAD)( TidyDocImpl* doc ) |
1696 | 40.7k | { |
1697 | 40.7k | Node *node = TY_(FindHTML)( doc ); |
1698 | | |
1699 | 40.7k | if ( node ) |
1700 | 40.7k | { |
1701 | 40.7k | for ( node = node->content; |
1702 | 41.2k | node && !nodeIsHEAD(node); |
1703 | 40.7k | node = node->next ) |
1704 | 538 | /**/; |
1705 | 40.7k | } |
1706 | | |
1707 | 40.7k | return node; |
1708 | 40.7k | } |
1709 | | |
1710 | | Node *TY_(FindTITLE)(TidyDocImpl* doc) |
1711 | 17.0k | { |
1712 | 17.0k | Node *node = TY_(FindHEAD)(doc); |
1713 | | |
1714 | 17.0k | if (node) |
1715 | 17.0k | for (node = node->content; |
1716 | 49.3k | node && !nodeIsTITLE(node); |
1717 | 32.3k | node = node->next) {} |
1718 | | |
1719 | 17.0k | return node; |
1720 | 17.0k | } |
1721 | | |
1722 | | Node *TY_(FindBody)( TidyDocImpl* doc ) |
1723 | 16.2k | { |
1724 | 16.2k | Node *node = ( doc ? doc->root.content : NULL ); |
1725 | | |
1726 | 20.5k | while ( node && !nodeIsHTML(node) ) |
1727 | 4.32k | node = node->next; |
1728 | | |
1729 | 16.2k | if (node == NULL) |
1730 | 0 | return NULL; |
1731 | | |
1732 | 16.2k | node = node->content; |
1733 | 38.7k | while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) ) |
1734 | 22.5k | node = node->next; |
1735 | | |
1736 | 16.2k | if ( node && nodeIsFRAMESET(node) ) |
1737 | 7.42k | { |
1738 | 7.42k | node = node->content; |
1739 | 9.98k | while ( node && !nodeIsNOFRAMES(node) ) |
1740 | 2.56k | node = node->next; |
1741 | | |
1742 | 7.42k | if ( node ) |
1743 | 6.53k | { |
1744 | 6.53k | node = node->content; |
1745 | 7.03k | while ( node && !nodeIsBODY(node) ) |
1746 | 494 | node = node->next; |
1747 | 6.53k | } |
1748 | 7.42k | } |
1749 | | |
1750 | 16.2k | return node; |
1751 | 16.2k | } |
1752 | | |
1753 | | /* add meta element for Tidy */ |
1754 | | Bool TY_(AddGenerator)( TidyDocImpl* doc ) |
1755 | 0 | { |
1756 | 0 | AttVal *attval; |
1757 | 0 | Node *node; |
1758 | 0 | Node *head = TY_(FindHEAD)( doc ); |
1759 | 0 | tmbchar buf[256]; |
1760 | | |
1761 | 0 | if (head) |
1762 | 0 | { |
1763 | 0 | #ifdef PLATFORM_NAME |
1764 | 0 | TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 for "PLATFORM_NAME" version %s", |
1765 | 0 | tidyLibraryVersion()); |
1766 | | #else |
1767 | | TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 version %s", tidyLibraryVersion()); |
1768 | | #endif |
1769 | |
|
1770 | 0 | for ( node = head->content; node; node = node->next ) |
1771 | 0 | { |
1772 | 0 | if ( nodeIsMETA(node) ) |
1773 | 0 | { |
1774 | 0 | attval = TY_(AttrGetById)(node, TidyAttr_NAME); |
1775 | |
|
1776 | 0 | if (AttrValueIs(attval, "generator")) |
1777 | 0 | { |
1778 | 0 | attval = TY_(AttrGetById)(node, TidyAttr_CONTENT); |
1779 | |
|
1780 | 0 | if (AttrHasValue(attval) && |
1781 | 0 | TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0) |
1782 | 0 | { |
1783 | | /* update the existing content to reflect the */ |
1784 | | /* actual version of Tidy currently being used */ |
1785 | | |
1786 | 0 | TidyDocFree(doc, attval->value); |
1787 | 0 | attval->value = TY_(tmbstrdup)(doc->allocator, buf); |
1788 | 0 | return no; |
1789 | 0 | } |
1790 | 0 | } |
1791 | 0 | } |
1792 | 0 | } |
1793 | | |
1794 | 0 | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) |
1795 | 0 | { |
1796 | 0 | node = TY_(InferredTag)(doc, TidyTag_META); |
1797 | 0 | TY_(AddAttribute)( doc, node, "name", "generator" ); |
1798 | 0 | TY_(AddAttribute)( doc, node, "content", buf ); |
1799 | 0 | TY_(InsertNodeAtStart)( head, node ); |
1800 | 0 | return yes; |
1801 | 0 | } |
1802 | 0 | } |
1803 | | |
1804 | 0 | return no; |
1805 | 0 | } |
1806 | | |
1807 | | /*\ examine <!DOCTYPE ...> to identify version |
1808 | | * Issue #167 and #169 |
1809 | | * If HTML5 |
1810 | | * <!DOCTYPE html> |
1811 | | * <!DOCTYPE html SYSTEM "about:legacy-compat"> |
1812 | | * else others |
1813 | | \*/ |
1814 | | static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype ) |
1815 | 10.9k | { |
1816 | 10.9k | AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC"); |
1817 | 10.9k | uint vers; |
1818 | | |
1819 | 10.9k | if (!fpi || !fpi->value) |
1820 | 8.74k | { |
1821 | | /*\ |
1822 | | * Is. #815 - change to case-insensitive test |
1823 | | * See REC: https://www.w3.org/TR/html5/syntax.html#the-doctype |
1824 | | \*/ |
1825 | 8.74k | if (doctype->element && (TY_(tmbstrcasecmp)(doctype->element,"html") == 0)) |
1826 | 105 | { |
1827 | 105 | return VERS_HTML5; /* TODO: do we need to check MORE? */ |
1828 | 105 | } |
1829 | | /* TODO: Consider warning, error message */ |
1830 | 8.63k | return VERS_UNKNOWN; |
1831 | 8.74k | } |
1832 | 2.20k | vers = GetVersFromFPI(fpi->value); |
1833 | | |
1834 | 2.20k | if (VERS_XHTML & vers) |
1835 | 80 | { |
1836 | 80 | TY_(SetOptionBool)(doc, TidyXmlOut, yes); |
1837 | 80 | TY_(SetOptionBool)(doc, TidyXhtmlOut, yes); |
1838 | 80 | doc->lexer->isvoyager = yes; |
1839 | 80 | } |
1840 | | |
1841 | | /* todo: add a warning if case does not match? */ |
1842 | 2.20k | TidyDocFree(doc, fpi->value); |
1843 | 2.20k | fpi->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(vers)); |
1844 | | |
1845 | 2.20k | return vers; |
1846 | 10.9k | } |
1847 | | |
1848 | | /* return guessed version */ |
1849 | | uint TY_(ApparentVersion)( TidyDocImpl* doc ) |
1850 | 0 | { |
1851 | 0 | if ((doc->lexer->doctype == XH11 || |
1852 | 0 | doc->lexer->doctype == XB10) && |
1853 | 0 | (doc->lexer->versions & doc->lexer->doctype)) |
1854 | 0 | return doc->lexer->doctype; |
1855 | 0 | else |
1856 | 0 | return TY_(HTMLVersion)(doc); |
1857 | 0 | } |
1858 | | |
1859 | | ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) ) |
1860 | 0 | { |
1861 | 0 | ctmbstr name = GetNameFromVers(vers); |
1862 | 0 | return name; |
1863 | 0 | } |
1864 | | |
1865 | | uint TY_(HTMLVersionNumberFromCode)( uint vers ) |
1866 | 0 | { |
1867 | 0 | uint i; |
1868 | |
|
1869 | 0 | for (i = 0; W3C_Doctypes[i].name; ++i) |
1870 | 0 | if (W3C_Doctypes[i].vers == vers) |
1871 | 0 | return W3C_Doctypes[i].vers_out; |
1872 | | |
1873 | 0 | return VERS_UNKNOWN; |
1874 | 0 | } |
1875 | | |
1876 | | Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ) |
1877 | 0 | { |
1878 | 0 | Bool isXhtml = doc->lexer->isvoyager; |
1879 | 0 | Node* doctype; |
1880 | | |
1881 | | /* Do not warn in XHTML mode */ |
1882 | 0 | if ( isXhtml ) |
1883 | 0 | return no; |
1884 | | |
1885 | | /* Do not warn if emitted doctype is proprietary */ |
1886 | 0 | if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL ) |
1887 | 0 | return no; |
1888 | | |
1889 | | /* Do not warn if no SI is possible */ |
1890 | 0 | if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL ) |
1891 | 0 | return no; |
1892 | | |
1893 | 0 | if ( (doctype = TY_(FindDocType)( doc )) != NULL |
1894 | 0 | && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL ) |
1895 | 0 | return yes; |
1896 | | |
1897 | 0 | return no; |
1898 | 0 | } |
1899 | | |
1900 | | |
1901 | | /* Put DOCTYPE declaration between the |
1902 | | ** <?xml version "1.0" ... ?> declaration, if any, |
1903 | | ** and the <html> tag. Should also work for any comments, |
1904 | | ** etc. that may precede the <html> tag. |
1905 | | */ |
1906 | | |
1907 | | static Node* NewDocTypeNode( TidyDocImpl* doc ) |
1908 | 0 | { |
1909 | 0 | Node* doctype = NULL; |
1910 | 0 | Node* html = TY_(FindHTML)( doc ); |
1911 | |
|
1912 | 0 | if ( !html ) |
1913 | 0 | return NULL; |
1914 | | |
1915 | 0 | doctype = TY_(NewNode)( doc->allocator, NULL ); |
1916 | 0 | doctype->type = DocTypeTag; |
1917 | 0 | TY_(InsertNodeBeforeElement)(html, doctype); |
1918 | 0 | return doctype; |
1919 | 0 | } |
1920 | | |
1921 | | Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ) |
1922 | 0 | { |
1923 | 0 | Lexer *lexer = doc->lexer; |
1924 | 0 | Node *doctype = TY_(FindDocType)( doc ); |
1925 | 0 | TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode); |
1926 | 0 | ctmbstr pub = "PUBLIC"; |
1927 | 0 | ctmbstr sys = "SYSTEM"; |
1928 | |
|
1929 | 0 | lexer->versionEmitted = TY_(ApparentVersion)( doc ); |
1930 | |
|
1931 | 0 | if (dtmode == TidyDoctypeOmit) |
1932 | 0 | { |
1933 | 0 | if (doctype) |
1934 | 0 | TY_(DiscardElement)(doc, doctype); |
1935 | 0 | return yes; |
1936 | 0 | } |
1937 | | |
1938 | 0 | if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype)) |
1939 | 0 | return no; |
1940 | | |
1941 | 0 | if (!doctype) |
1942 | 0 | { |
1943 | 0 | doctype = NewDocTypeNode(doc); |
1944 | 0 | doctype->element = TY_(tmbstrdup)(doc->allocator, "html"); |
1945 | 0 | } |
1946 | 0 | else |
1947 | 0 | { |
1948 | 0 | doctype->element = TY_(tmbstrtolower)(doctype->element); |
1949 | 0 | } |
1950 | |
|
1951 | 0 | switch(dtmode) |
1952 | 0 | { |
1953 | 0 | case TidyDoctypeHtml5: |
1954 | | /* HTML5 */ |
1955 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, NULL); |
1956 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, NULL); |
1957 | 0 | lexer->versionEmitted = XH50; |
1958 | 0 | break; |
1959 | 0 | case TidyDoctypeStrict: |
1960 | | /* XHTML 1.0 Strict */ |
1961 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S)); |
1962 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S)); |
1963 | 0 | lexer->versionEmitted = X10S; |
1964 | 0 | break; |
1965 | 0 | case TidyDoctypeLoose: |
1966 | | /* XHTML 1.0 Transitional */ |
1967 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T)); |
1968 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T)); |
1969 | 0 | lexer->versionEmitted = X10T; |
1970 | 0 | break; |
1971 | 0 | case TidyDoctypeUser: |
1972 | | /* user defined document type declaration */ |
1973 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype)); |
1974 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, ""); |
1975 | 0 | break; |
1976 | 0 | case TidyDoctypeAuto: |
1977 | 0 | if (lexer->doctype == VERS_UNKNOWN || lexer->doctype == VERS_HTML5) { |
1978 | 0 | lexer->versionEmitted = XH50; |
1979 | 0 | return yes; |
1980 | 0 | } |
1981 | 0 | else if (lexer->versions & XH11 && lexer->doctype == XH11) |
1982 | 0 | { |
1983 | 0 | if (!TY_(GetAttrByName)(doctype, sys)) |
1984 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11)); |
1985 | 0 | lexer->versionEmitted = XH11; |
1986 | 0 | return yes; |
1987 | 0 | } |
1988 | 0 | else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40)) |
1989 | 0 | { |
1990 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11)); |
1991 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11)); |
1992 | 0 | lexer->versionEmitted = XH11; |
1993 | 0 | } |
1994 | 0 | else if (lexer->versions & XB10 && lexer->doctype == XB10) |
1995 | 0 | { |
1996 | 0 | if (!TY_(GetAttrByName)(doctype, sys)) |
1997 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10)); |
1998 | 0 | lexer->versionEmitted = XB10; |
1999 | 0 | return yes; |
2000 | 0 | } |
2001 | 0 | else if (lexer->versions & VERS_HTML40_STRICT) |
2002 | 0 | { |
2003 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S)); |
2004 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S)); |
2005 | 0 | lexer->versionEmitted = X10S; |
2006 | 0 | } |
2007 | 0 | else if (lexer->versions & VERS_FRAMESET) |
2008 | 0 | { |
2009 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F)); |
2010 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F)); |
2011 | 0 | lexer->versionEmitted = X10F; |
2012 | 0 | } |
2013 | 0 | else if (lexer->versions & VERS_LOOSE) |
2014 | 0 | { |
2015 | 0 | TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T)); |
2016 | 0 | TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T)); |
2017 | 0 | lexer->versionEmitted = X10T; |
2018 | 0 | } |
2019 | 0 | else if (lexer->versions & VERS_HTML5) |
2020 | 0 | { |
2021 | | /*\ |
2022 | | * Issue #273 - If still a html5/xhtml5 bit |
2023 | | * existing, that is the 'ConstrainVersion' has |
2024 | | * not eliminated all HTML5, then nothing to do here. |
2025 | | * Certainly do **not** delete the DocType node! |
2026 | | * see: http://www.w3.org/QA/Tips/Doctype |
2027 | | \*/ |
2028 | 0 | } |
2029 | 0 | else |
2030 | 0 | { |
2031 | 0 | if (doctype) |
2032 | 0 | TY_(DiscardElement)(doc, doctype); |
2033 | 0 | return no; |
2034 | 0 | } |
2035 | 0 | break; |
2036 | 0 | case TidyDoctypeOmit: |
2037 | 0 | assert(0); |
2038 | 0 | break; |
2039 | 0 | } |
2040 | | |
2041 | 0 | return no; |
2042 | 0 | } |
2043 | | |
2044 | | /* fixup doctype if missing */ |
2045 | | Bool TY_(FixDocType)( TidyDocImpl* doc ) |
2046 | 0 | { |
2047 | 0 | Lexer* lexer = doc->lexer; |
2048 | 0 | Node* doctype = TY_(FindDocType)( doc ); |
2049 | 0 | uint dtmode = cfg( doc, TidyDoctypeMode ); |
2050 | 0 | uint guessed = VERS_UNKNOWN; |
2051 | 0 | Bool hadSI = no; |
2052 | | |
2053 | | /* Issue #167 - found doctype, and doctype is default VERS_HTML5, set VERS_HTML5 and return yes */ |
2054 | 0 | if (doctype && (dtmode == TidyDoctypeAuto) && |
2055 | 0 | (lexer->doctype == VERS_HTML5) ) |
2056 | 0 | { |
2057 | | /* The version emitted cannot be a composite value! */ |
2058 | 0 | lexer->versionEmitted = HT50; |
2059 | 0 | return yes; |
2060 | 0 | } |
2061 | 0 | if (dtmode == TidyDoctypeAuto && |
2062 | 0 | lexer->versions & lexer->doctype && |
2063 | 0 | !(VERS_XHTML & lexer->doctype && !lexer->isvoyager) |
2064 | 0 | && TY_(FindDocType)(doc)) |
2065 | 0 | { |
2066 | 0 | lexer->versionEmitted = lexer->doctype; |
2067 | 0 | return yes; |
2068 | 0 | } |
2069 | | |
2070 | 0 | if (dtmode == TidyDoctypeOmit) |
2071 | 0 | { |
2072 | 0 | if (doctype) |
2073 | 0 | TY_(DiscardElement)( doc, doctype ); |
2074 | 0 | lexer->versionEmitted = TY_(ApparentVersion)( doc ); |
2075 | 0 | return yes; |
2076 | 0 | } |
2077 | | |
2078 | 0 | if (cfgBool(doc, TidyXmlOut)) |
2079 | 0 | return yes; |
2080 | | |
2081 | 0 | if (doctype) |
2082 | 0 | hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL; |
2083 | |
|
2084 | 0 | if ((dtmode == TidyDoctypeStrict || |
2085 | 0 | dtmode == TidyDoctypeLoose) && doctype) |
2086 | 0 | { |
2087 | 0 | TY_(DiscardElement)(doc, doctype); |
2088 | 0 | doctype = NULL; |
2089 | 0 | } |
2090 | |
|
2091 | 0 | switch (dtmode) |
2092 | 0 | { |
2093 | 0 | case TidyDoctypeHtml5: |
2094 | 0 | guessed = HT50; |
2095 | 0 | break; |
2096 | 0 | case TidyDoctypeStrict: |
2097 | 0 | guessed = H41S; |
2098 | 0 | break; |
2099 | 0 | case TidyDoctypeLoose: |
2100 | 0 | guessed = H41T; |
2101 | 0 | break; |
2102 | 0 | case TidyDoctypeAuto: |
2103 | 0 | guessed = TY_(HTMLVersion)(doc); |
2104 | 0 | break; |
2105 | 0 | } |
2106 | | |
2107 | 0 | lexer->versionEmitted = guessed; |
2108 | 0 | if (guessed == VERS_UNKNOWN) |
2109 | 0 | return no; |
2110 | | |
2111 | 0 | if (doctype) |
2112 | 0 | { |
2113 | 0 | doctype->element = TY_(tmbstrtolower)(doctype->element); |
2114 | 0 | } |
2115 | 0 | else |
2116 | 0 | { |
2117 | 0 | doctype = NewDocTypeNode(doc); |
2118 | 0 | doctype->element = TY_(tmbstrdup)(doc->allocator, "html"); |
2119 | 0 | } |
2120 | |
|
2121 | 0 | TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed)); |
2122 | |
|
2123 | 0 | if (hadSI) |
2124 | 0 | TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed)); |
2125 | |
|
2126 | 0 | return yes; |
2127 | 0 | } |
2128 | | |
2129 | | /* ensure XML document starts with <?xml version="1.0"?> */ |
2130 | | /* add encoding attribute if not using ASCII or UTF-8 output */ |
2131 | | Bool TY_(FixXmlDecl)( TidyDocImpl* doc ) |
2132 | 0 | { |
2133 | 0 | Node* xml; |
2134 | 0 | AttVal *version, *encoding; |
2135 | 0 | Lexer*lexer = doc->lexer; |
2136 | 0 | Node* root = &doc->root; |
2137 | |
|
2138 | 0 | if ( root->content && root->content->type == XmlDecl ) |
2139 | 0 | { |
2140 | 0 | xml = root->content; |
2141 | 0 | } |
2142 | 0 | else |
2143 | 0 | { |
2144 | 0 | xml = TY_(NewNode)(lexer->allocator, lexer); |
2145 | 0 | xml->type = XmlDecl; |
2146 | 0 | if ( root->content ) |
2147 | 0 | TY_(InsertNodeBeforeElement)(root->content, xml); |
2148 | 0 | else |
2149 | 0 | root->content = xml; |
2150 | 0 | } |
2151 | |
|
2152 | 0 | version = TY_(GetAttrByName)(xml, "version"); |
2153 | 0 | encoding = TY_(GetAttrByName)(xml, "encoding"); |
2154 | | |
2155 | | /* |
2156 | | We need to insert a check if declared encoding |
2157 | | and output encoding mismatch and fix the XML |
2158 | | declaration accordingly!!! |
2159 | | */ |
2160 | |
|
2161 | 0 | if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 ) |
2162 | 0 | { |
2163 | 0 | ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding)); |
2164 | 0 | if ( enc ) |
2165 | 0 | TY_(AddAttribute)( doc, xml, "encoding", enc ); |
2166 | 0 | } |
2167 | |
|
2168 | 0 | if ( version == NULL ) |
2169 | 0 | TY_(AddAttribute)( doc, xml, "version", "1.0" ); |
2170 | 0 | return yes; |
2171 | 0 | } |
2172 | | |
2173 | | Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id) |
2174 | 238k | { |
2175 | 238k | Lexer *lexer = doc->lexer; |
2176 | 238k | Node *node = TY_(NewNode)( lexer->allocator, lexer ); |
2177 | 238k | const Dict* dict = TY_(LookupTagDef)(id); |
2178 | | |
2179 | 238k | assert( dict != NULL ); |
2180 | | |
2181 | 238k | node->type = StartTag; |
2182 | 238k | node->implicit = yes; |
2183 | 238k | node->element = TY_(tmbstrdup)(doc->allocator, dict->name); |
2184 | 238k | node->tag = dict; |
2185 | 238k | node->start = lexer->txtstart; |
2186 | 238k | node->end = lexer->txtend; |
2187 | | |
2188 | 238k | return node; |
2189 | 238k | } |
2190 | | |
2191 | | static Bool ExpectsContent(Node *node) |
2192 | 931k | { |
2193 | 931k | if (node->type != StartTag) |
2194 | 13.7k | return no; |
2195 | | |
2196 | | /* unknown element? */ |
2197 | 917k | if (node->tag == NULL) |
2198 | 162k | return yes; |
2199 | | |
2200 | 754k | if (node->tag->model & CM_EMPTY) |
2201 | 67.6k | return no; |
2202 | | |
2203 | 687k | return yes; |
2204 | 754k | } |
2205 | | |
2206 | | /* |
2207 | | create a text node for the contents of |
2208 | | a CDATA element like style or script |
2209 | | which ends with </foo> for some foo. |
2210 | | */ |
2211 | | |
/* Scanner states used by GetCDATA() while reading CDATA element content. */
typedef enum
{
    CDATA_INTERMEDIATE, /* ordinary content; not inside a tag name */
    CDATA_STARTTAG,     /* '<' + Letter found; reading a start-tag name */
    CDATA_ENDTAG        /* '<' + '/' + Letter found; reading an end-tag name */
} CDATAState;
2218 | | |
/* Read the raw content of a CDATA-style container (the caller passes the
** element whose content is being read) and return it as a text token.
**
** Characters are copied from doc->docIn into the lexer buffer until an end
** tag whose name begins with container->element (case-insensitive prefix
** comparison over the length of container->element) is seen at nesting
** depth zero; that end tag is pushed back onto the input for the caller.
**
** Returns NULL, with the consumed "<x" pushed back, when the container is a
** SCRIPT with a SRC attribute, only white space has been read so far, and a
** start tag begins.  Otherwise returns the result of TextToken().  Reports
** MISSING_ENDTAG_FOR when the input ends first.
*/
static Node *GetCDATA( TidyDocImpl* doc, Node *container )
{
    Lexer* lexer = doc->lexer;
    uint start = 0;                 /* lexbuf index where the current tag name begins */
    int nested = 0;                 /* count of unmatched nested start tags of the container's name */
    CDATAState state = CDATA_INTERMEDIATE;
    uint i;
    Bool isEmpty = yes;             /* stays yes until a non-white char outside markup is seen */
    Bool matches = no;
    uint c;
    Bool hasSrc = (TY_(AttrGetById)(container, TidyAttr_SRC) != NULL) ? yes : no;
    /*\ Issue #65 (1642186) and #280 - is script or style, and the option on
     *  If yes, then avoid incrementing nested...
    \*/
    Bool nonested = ((nodeIsSCRIPT(container) || (nodeIsSTYLE(container))) &&
                     cfgBool(doc, TidySkipNested)) ? yes : no;

    SetLexerLocus( doc, lexer );
    lexer->waswhite = no;
    lexer->txtstart = lexer->txtend = lexer->lexsize;

    /* seen start tag, look for matching end tag */
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
    {
        TY_(AddCharToLexer)(lexer, c);
        lexer->txtend = lexer->lexsize;

        if (state == CDATA_INTERMEDIATE)
        {
            if (c != '<')
            {
                if (isEmpty && !TY_(IsWhite)(c))
                    isEmpty = no;
                continue;
            }

            /* look at the character following '<' */
            c = TY_(ReadChar)(doc->docIn);

            if (TY_(IsLetter)(c))
            {
                /* <head><script src=foo><meta name=foo content=bar>*/
                if (hasSrc && isEmpty && nodeIsSCRIPT(container))
                {
                    /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
                    /* drop what was buffered and hand "<c" back to the input */
                    lexer->lexsize = lexer->txtstart;
                    TY_(UngetChar)(c, doc->docIn);
                    TY_(UngetChar)('<', doc->docIn);
                    return NULL;
                }
                TY_(AddCharToLexer)(lexer, c);
                start = lexer->lexsize - 1;     /* index of the letter just stored */
                state = CDATA_STARTTAG;
            }
            else if (c == '/')
            {
                TY_(AddCharToLexer)(lexer, c);

                /* peek: only "</" followed by a letter starts an end tag */
                c = TY_(ReadChar)(doc->docIn);

                if (!TY_(IsLetter)(c))
                {
                    TY_(UngetChar)(c, doc->docIn);
                    continue;
                }
                TY_(UngetChar)(c, doc->docIn);

                start = lexer->lexsize;         /* the name will be stored from here on */
                state = CDATA_ENDTAG;
            }
            else if (c == '\\')
            {
                /* recognize document.write("<script><\/script>") */
                TY_(AddCharToLexer)(lexer, c);

                c = TY_(ReadChar)(doc->docIn);

                if (c != '/')
                {
                    TY_(UngetChar)(c, doc->docIn);
                    continue;
                }

                TY_(AddCharToLexer)(lexer, c);

                if (nonested) {
                    /*\
                     *  Issue #65 - for version 5.1.14.EXP2
                     *  If the nonested option is ON then the <script>
                     *  tag did not bump nested, so no need to treat this as
                     *  an end tag just to decrease nested, just continue!
                    \*/
                    continue;
                }

                /* peek: only "<\/" followed by a letter is treated as an end tag */
                c = TY_(ReadChar)(doc->docIn);

                if (!TY_(IsLetter)(c))
                {
                    TY_(UngetChar)(c, doc->docIn);
                    continue;
                }
                TY_(UngetChar)(c, doc->docIn);

                start = lexer->lexsize;
                state = CDATA_ENDTAG;
            }
            else
            {
                /* not markup: let the main loop read this character normally */
                TY_(UngetChar)(c, doc->docIn);
            }
        }
        /* '<' + Letter found */
        else if (state == CDATA_STARTTAG)
        {
            if (TY_(IsLetter)(c))
                continue;

            /* first non-letter ends the name; compare it with the container's name */
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
                                          TY_(tmbstrlen)(container->element)) == 0;
            if (matches && !nonested)
                nested++;

            state = CDATA_INTERMEDIATE;
        }
        /* '<' + '/' + Letter found */
        else if (state == CDATA_ENDTAG)
        {
            if (TY_(IsLetter)(c))
                continue;

            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
                                          TY_(tmbstrlen)(container->element)) == 0;

            if (isEmpty && !matches)
            {
                /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */

                /* push the buffered tag back, last character first, then "</" */
                for (i = lexer->lexsize - 1; i >= start; --i)
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
                TY_(UngetChar)('/', doc->docIn);
                TY_(UngetChar)('<', doc->docIn);
                break;
            }

            /* nested is decremented only when the name matches (short-circuit) */
            if (matches && nested-- <= 0)
            {
                /* closing tag of the container itself: give it back to the input */
                for (i = lexer->lexsize - 1; i >= start; --i)
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
                TY_(UngetChar)('/', doc->docIn);
                TY_(UngetChar)('<', doc->docIn);
                /* drop the tag name plus the two characters stored before it */
                lexer->lexsize -= (lexer->lexsize - start) + 2;
                break;
            }
            else if (lexer->lexbuf[start - 2] != '\\')
            {
                /* if the end tag is not already escaped using backslash */
                SetLexerLocus( doc, lexer );
                lexer->columns -= 3;

                /*\ if javascript insert backslash before /
                 *  Issue #348 - Add option, escape-scripts, to skip
                \*/
                if ((TY_(IsJavaScript)(container)) && cfgBool(doc, TidyEscapeScripts) &&
                    !TY_(IsHTML5Mode)(doc) )  /* Is #700 - This only applies to legacy html4 mode */
                {
                    /* Issue #281 - only warn if adding the escape! */
                    TY_(Report)(doc, NULL, NULL, BAD_CDATA_CONTENT);

                    /* shift everything from index start-1 one place to the right ... */
                    for (i = lexer->lexsize; i > start-1; --i)
                        lexer->lexbuf[i] = lexer->lexbuf[i-1];

                    /* ... and store the backslash in the gap, just before the '/' */
                    lexer->lexbuf[start-1] = '\\';
                    lexer->lexsize++;
                }
            }
            state = CDATA_INTERMEDIATE;
        }
    }
    if (isEmpty)
        /* nothing but white space was read: produce an empty text range */
        lexer->lexsize = lexer->txtstart = lexer->txtend;
    else
        lexer->txtend = lexer->lexsize;

    if (c == EndOfStream)
        TY_(Report)(doc, container, NULL, MISSING_ENDTAG_FOR );

    return TY_(TextToken)(lexer);
}
2407 | | |
2408 | | void TY_(UngetToken)( TidyDocImpl* doc ) |
2409 | 990k | { |
2410 | 990k | doc->lexer->pushed = yes; |
2411 | 990k | } |
2412 | | |
/* CondReturnTextNode(doc, skip)
**
** If the lexer holds pending text (txtend > txtstart), turn it into a
** text token, store it in lexer->token and return it from the enclosing
** function.  Expands to a bare `if` statement that refers to a variable
** named `lexer` in the expanding scope.  Neither expansion uses the
** `skip` argument.  The debug variant additionally logs the node via
** GTDBG.
*/
#if defined(ENABLE_DEBUG_LOG)
#  define CondReturnTextNode(doc, skip) \
    if (lexer->txtend > lexer->txtstart) { \
        Node *_node = TY_(TextToken)(lexer); \
        lexer->token = _node; \
        GTDBG(doc,"text_node",_node); \
        return _node; \
    }

#else
#  define CondReturnTextNode(doc, skip) \
    if (lexer->txtend > lexer->txtstart) \
    { \
        lexer->token = TY_(TextToken)(lexer); \
        return lexer->token; \
    }
#endif
2430 | | |
2431 | | /* |
2432 | | modes for GetToken() |
2433 | | |
2434 | | MixedContent -- for elements which don't accept PCDATA |
2435 | | Preformatted -- white space preserved as is |
2436 | | IgnoreMarkup -- for CDATA elements such as script, style |
2437 | | */ |
2438 | | static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ); |
2439 | | |
2440 | | Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ) |
2441 | 5.55M | { |
2442 | 5.55M | Node *node; |
2443 | 5.55M | Lexer* lexer = doc->lexer; |
2444 | | |
2445 | 5.55M | if (lexer->pushed || lexer->itoken) |
2446 | 1.00M | { |
2447 | | /* Deal with previously returned duplicate inline token */ |
2448 | 1.00M | if (lexer->itoken) |
2449 | 14.4k | { |
2450 | | /* itoken rejected */ |
2451 | 14.4k | if (lexer->pushed) |
2452 | 6 | { |
2453 | 6 | lexer->pushed = no; |
2454 | 6 | node = lexer->itoken; |
2455 | 6 | GTDBG(doc,"lex-itoken", node); |
2456 | 6 | return node; |
2457 | 6 | } |
2458 | | /* itoken has been accepted */ |
2459 | 14.4k | lexer->itoken = NULL; |
2460 | 14.4k | } |
2461 | | |
2462 | | /* duplicate inlines in preference to pushed text nodes when appropriate */ |
2463 | 1.00M | lexer->pushed = no; |
2464 | 1.00M | if (lexer->token->type != TextNode |
2465 | 1.00M | || !(lexer->insert || lexer->inode)) { |
2466 | 990k | node = lexer->token; |
2467 | 990k | GTDBG(doc,"lex-token", node); |
2468 | 990k | return node; |
2469 | 990k | } |
2470 | 14.4k | lexer->itoken = TY_(InsertedToken)( doc ); |
2471 | 14.4k | node = lexer->itoken; |
2472 | 14.4k | GTDBG(doc,"lex-inserted", node); |
2473 | 14.4k | return node; |
2474 | 1.00M | } |
2475 | | |
2476 | 4.55M | assert( !(lexer->pushed || lexer->itoken) ); |
2477 | | |
2478 | | /* at start of block elements, unclosed inline |
2479 | | elements are inserted into the token stream |
2480 | | Issue #341 - Can NOT insert a token if NO istacksize |
2481 | | */ |
2482 | 4.55M | if ((lexer->insert || lexer->inode) && lexer->istacksize) |
2483 | 1.66M | { |
2484 | | /*\ Issue #92: could fix by the following, but instead chose not to stack these 2 |
2485 | | * if ( !(lexer->insert && (nodeIsINS(lexer->insert) || nodeIsDEL(lexer->insert))) ) { |
2486 | | \*/ |
2487 | 1.66M | lexer->token = TY_(InsertedToken)( doc ); |
2488 | 1.66M | node = lexer->token; |
2489 | 1.66M | GTDBG(doc,"lex-inserted2", node); |
2490 | 1.66M | return node; |
2491 | 1.66M | } |
2492 | | |
2493 | 2.88M | if (mode == CdataContent) |
2494 | 3.41k | { |
2495 | 3.41k | assert( lexer->parent != NULL ); |
2496 | 3.41k | node = GetCDATA(doc, lexer->parent); |
2497 | 3.41k | GTDBG(doc,"lex-cdata", node); |
2498 | 3.41k | return node; |
2499 | 3.41k | } |
2500 | | |
2501 | 2.88M | return GetTokenFromStream( doc, mode ); |
2502 | 2.88M | } |
2503 | | |
2504 | | #if defined(ENABLE_DEBUG_LOG) |
/* Debug-log helper (only built with ENABLE_DEBUG_LOG): writes
** "Have node <name>" through SPRTF.
*/
static void check_me(char *name)
{
    SPRTF("Have node %s\n", name);
}
2509 | | #endif |
2510 | | |
2511 | | static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode ) |
2512 | 2.88M | { |
2513 | 2.88M | Lexer* lexer = doc->lexer; |
2514 | 2.88M | uint c, lexdump, badcomment = 0; |
2515 | 2.88M | Bool isempty = no; |
2516 | 2.88M | AttVal *attributes = NULL; |
2517 | 2.88M | Node *node; |
2518 | 2.88M | Bool fixComments; |
2519 | | |
2520 | 2.88M | switch ( cfgAutoBool(doc, TidyFixComments) ) |
2521 | 2.88M | { |
2522 | 0 | case TidyYesState: |
2523 | 0 | fixComments = yes; |
2524 | 0 | break; |
2525 | | |
2526 | 0 | case TidyNoState: |
2527 | 0 | fixComments = no; |
2528 | 0 | break; |
2529 | | |
2530 | 2.88M | default: |
2531 | 2.88M | fixComments = (TY_(HTMLVersion)(doc) & HT50) == 0; |
2532 | 2.88M | break; |
2533 | 2.88M | } |
2534 | | |
2535 | | /* Lexer->token must be set on return. Nullify it for safety. */ |
2536 | 2.88M | lexer->token = NULL; |
2537 | | |
2538 | 2.88M | SetLexerLocus( doc, lexer ); |
2539 | 2.88M | lexer->waswhite = no; |
2540 | | |
2541 | 2.88M | lexer->txtstart = lexer->txtend = lexer->lexsize; |
2542 | | |
2543 | 243M | while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream) |
2544 | 241M | { |
2545 | 241M | if (lexer->insertspace) |
2546 | 2.39k | { |
2547 | 2.39k | TY_(AddCharToLexer)(lexer, ' '); |
2548 | 2.39k | lexer->waswhite = yes; |
2549 | 2.39k | lexer->insertspace = no; |
2550 | 2.39k | } |
2551 | | |
2552 | 241M | if (c == 160 && (mode == Preformatted)) |
2553 | 307 | c = ' '; |
2554 | | |
2555 | 241M | TY_(AddCharToLexer)(lexer, c); |
2556 | | |
2557 | 241M | switch (lexer->state) |
2558 | 241M | { |
2559 | 102M | case LEX_CONTENT: /* element content */ |
2560 | | |
2561 | | /* |
2562 | | Discard white space if appropriate. Its cheaper |
2563 | | to do this here rather than in parser methods |
2564 | | for elements that don't have mixed content. |
2565 | | */ |
2566 | 102M | if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) |
2567 | 102M | && lexer->lexsize == lexer->txtstart + 1) |
2568 | 118k | { |
2569 | 118k | --(lexer->lexsize); |
2570 | 118k | lexer->waswhite = no; |
2571 | 118k | SetLexerLocus( doc, lexer ); |
2572 | 118k | continue; |
2573 | 118k | } |
2574 | | |
2575 | 101M | if (c == '<') |
2576 | 1.32M | { |
2577 | 1.32M | lexer->state = LEX_GT; |
2578 | 1.32M | continue; |
2579 | 1.32M | } |
2580 | | |
2581 | 100M | if (TY_(IsWhite)(c)) |
2582 | 89.4M | { |
2583 | | /* was previous character white? */ |
2584 | 89.4M | if (lexer->waswhite) |
2585 | 89.2M | { |
2586 | 89.2M | if (mode != Preformatted && mode != IgnoreMarkup) |
2587 | 1.76M | { |
2588 | 1.76M | --(lexer->lexsize); |
2589 | 1.76M | SetLexerLocus( doc, lexer ); |
2590 | 1.76M | } |
2591 | 89.2M | } |
2592 | 190k | else /* prev character wasn't white */ |
2593 | 190k | { |
2594 | 190k | lexer->waswhite = yes; |
2595 | | |
2596 | 190k | if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') |
2597 | 67.9k | ChangeChar(lexer, ' '); |
2598 | 190k | } |
2599 | | |
2600 | 89.4M | continue; |
2601 | 89.4M | } |
2602 | 11.1M | else if (c == '&' && mode != IgnoreMarkup) |
2603 | 77.4k | ParseEntity( doc, mode ); |
2604 | | |
2605 | | /* this is needed to avoid trimming trailing whitespace */ |
2606 | 11.1M | if (mode == IgnoreWhitespace) |
2607 | 47.4k | mode = MixedContent; |
2608 | | |
2609 | 11.1M | lexer->waswhite = no; |
2610 | 11.1M | continue; |
2611 | | |
2612 | 1.32M | case LEX_GT: /* < */ |
2613 | | |
2614 | | /* check for endtag */ |
2615 | 1.32M | if (c == '/') |
2616 | 116k | { |
2617 | 116k | if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) |
2618 | 46 | { |
2619 | 46 | TY_(UngetChar)(c, doc->docIn); |
2620 | 46 | continue; |
2621 | 46 | } |
2622 | | |
2623 | 116k | TY_(AddCharToLexer)(lexer, c); |
2624 | | |
2625 | 116k | if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c))) |
2626 | 104k | { |
2627 | 104k | lexer->lexsize -= 3; |
2628 | 104k | lexer->txtend = lexer->lexsize; |
2629 | 104k | TY_(UngetChar)(c, doc->docIn); |
2630 | 104k | lexer->state = LEX_ENDTAG; |
2631 | 104k | lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */ |
2632 | 104k | doc->docIn->curcol -= 2; |
2633 | | |
2634 | | /* if some text before the </ return it now */ |
2635 | 104k | if (lexer->txtend > lexer->txtstart) |
2636 | 13.2k | { |
2637 | | /* trim space character before end tag */ |
2638 | 13.2k | if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ') |
2639 | 364 | { |
2640 | 364 | lexer->lexsize -= 1; |
2641 | 364 | lexer->txtend = lexer->lexsize; |
2642 | 364 | } |
2643 | 13.2k | lexer->token = TY_(TextToken)(lexer); |
2644 | 13.2k | node = lexer->token; |
2645 | 13.2k | GTDBG(doc,"text", node); |
2646 | 13.2k | return node; |
2647 | 13.2k | } |
2648 | | |
2649 | 91.1k | continue; /* no text so keep going */ |
2650 | 104k | } |
2651 | | |
2652 | | /* otherwise treat as CDATA */ |
2653 | 11.6k | lexer->waswhite = no; |
2654 | 11.6k | lexer->state = LEX_CONTENT; |
2655 | 11.6k | continue; |
2656 | 116k | } |
2657 | | |
2658 | 1.20M | if (mode == IgnoreMarkup) |
2659 | 0 | { |
2660 | | /* otherwise treat as CDATA */ |
2661 | 0 | lexer->waswhite = no; |
2662 | 0 | lexer->state = LEX_CONTENT; |
2663 | 0 | continue; |
2664 | 0 | } |
2665 | | |
2666 | | /* |
2667 | | look out for comments, doctype or marked sections |
2668 | | this isn't quite right, but its getting there ... |
2669 | | */ |
2670 | 1.20M | if (c == '!') |
2671 | 67.4k | { |
2672 | 67.4k | c = TY_(ReadChar)(doc->docIn); |
2673 | | |
2674 | 67.4k | if (c == '-') |
2675 | 4.99k | { |
2676 | 4.99k | c = TY_(ReadChar)(doc->docIn); |
2677 | | |
2678 | 4.99k | if (c == '-') |
2679 | 4.67k | { |
2680 | 4.67k | lexer->state = LEX_COMMENT; /* comment */ |
2681 | 4.67k | lexer->lexsize -= 2; |
2682 | 4.67k | lexer->txtend = lexer->lexsize; |
2683 | | |
2684 | 4.67k | CondReturnTextNode(doc, 4) |
2685 | | |
2686 | 4.35k | lexer->txtstart = lexer->lexsize; |
2687 | 4.35k | continue; |
2688 | 4.67k | } |
2689 | | |
2690 | | /* |
2691 | | TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING ); |
2692 | | Warning now done later - see issue #487 |
2693 | | */ |
2694 | 4.99k | } |
2695 | 62.4k | else if (c == 'd' || c == 'D') |
2696 | 55.1k | { |
2697 | | /* todo: check for complete "<!DOCTYPE" not just <!D */ |
2698 | | |
2699 | 55.1k | uint skip = 0; |
2700 | | |
2701 | 55.1k | lexer->state = LEX_DOCTYPE; /* doctype */ |
2702 | 55.1k | lexer->lexsize -= 2; |
2703 | 55.1k | lexer->txtend = lexer->lexsize; |
2704 | 55.1k | mode = IgnoreWhitespace; |
2705 | | |
2706 | | /* skip until white space or '>' */ |
2707 | | |
2708 | 55.1k | for (;;) |
2709 | 93.1k | { |
2710 | 93.1k | c = TY_(ReadChar)(doc->docIn); |
2711 | 93.1k | ++skip; |
2712 | | |
2713 | 93.1k | if (c == EndOfStream || c == '>') |
2714 | 37.4k | { |
2715 | 37.4k | TY_(UngetChar)(c, doc->docIn); |
2716 | 37.4k | break; |
2717 | 37.4k | } |
2718 | | |
2719 | | |
2720 | 55.7k | if (!TY_(IsWhite)(c)) |
2721 | 38.0k | continue; |
2722 | | |
2723 | | /* and skip to end of whitespace */ |
2724 | | |
2725 | 17.6k | for (;;) |
2726 | 44.1k | { |
2727 | 44.1k | c = TY_(ReadChar)(doc->docIn); |
2728 | 44.1k | ++skip; |
2729 | | |
2730 | 44.1k | if (c == EndOfStream || c == '>') |
2731 | 1.41k | { |
2732 | 1.41k | TY_(UngetChar)(c, doc->docIn); |
2733 | 1.41k | break; |
2734 | 1.41k | } |
2735 | | |
2736 | | |
2737 | 42.6k | if (TY_(IsWhite)(c)) |
2738 | 26.4k | continue; |
2739 | | |
2740 | 16.2k | TY_(UngetChar)(c, doc->docIn); |
2741 | 16.2k | break; |
2742 | 42.6k | } |
2743 | | |
2744 | 17.6k | break; |
2745 | 55.7k | } |
2746 | | |
2747 | 55.1k | CondReturnTextNode(doc, (skip + 3)) |
2748 | | |
2749 | 42.6k | lexer->txtstart = lexer->lexsize; |
2750 | 42.6k | continue; |
2751 | 55.1k | } |
2752 | 7.37k | else if (c == '[') |
2753 | 3.78k | { |
2754 | | /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */ |
2755 | 3.78k | lexer->lexsize -= 2; |
2756 | 3.78k | lexer->state = LEX_SECTION; |
2757 | 3.78k | lexer->txtend = lexer->lexsize; |
2758 | | |
2759 | 3.78k | CondReturnTextNode(doc, 2) |
2760 | | |
2761 | 2.75k | lexer->txtstart = lexer->lexsize; |
2762 | 2.75k | continue; |
2763 | 3.78k | } |
2764 | | |
2765 | | |
2766 | | /* |
2767 | | We only print this message if there's a missing |
2768 | | starting hyphen; this comment will be dropped. |
2769 | | */ |
2770 | 3.90k | TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING ); /* Is. #487 */ |
2771 | | |
2772 | | /* else swallow characters up to and including next '>' */ |
2773 | 104k | while ((c = TY_(ReadChar)(doc->docIn)) != '>') |
2774 | 100k | { |
2775 | 100k | if (c == EndOfStream) |
2776 | 142 | { |
2777 | 142 | TY_(UngetChar)(c, doc->docIn); |
2778 | 142 | break; |
2779 | 142 | } |
2780 | 100k | } |
2781 | | |
2782 | 3.90k | lexer->lexsize -= 2; |
2783 | 3.90k | lexer->lexbuf[lexer->lexsize] = '\0'; |
2784 | 3.90k | lexer->state = LEX_CONTENT; |
2785 | 3.90k | continue; |
2786 | 67.4k | } |
2787 | | |
2788 | | /* |
2789 | | processing instructions |
2790 | | */ |
2791 | | |
2792 | 1.14M | if (c == '?') |
2793 | 21.2k | { |
2794 | 21.2k | lexer->lexsize -= 2; |
2795 | 21.2k | lexer->state = LEX_PROCINSTR; |
2796 | 21.2k | lexer->txtend = lexer->lexsize; |
2797 | | |
2798 | 21.2k | CondReturnTextNode(doc, 2) |
2799 | | |
2800 | 12.7k | lexer->txtstart = lexer->lexsize; |
2801 | 12.7k | continue; |
2802 | 21.2k | } |
2803 | | |
2804 | | /* Microsoft ASP's e.g. <% ... server-code ... %> */ |
2805 | 1.12M | if (c == '%') |
2806 | 1.57k | { |
2807 | 1.57k | lexer->lexsize -= 2; |
2808 | 1.57k | lexer->state = LEX_ASP; |
2809 | 1.57k | lexer->txtend = lexer->lexsize; |
2810 | | |
2811 | 1.57k | CondReturnTextNode(doc, 2) |
2812 | | |
2813 | 619 | lexer->txtstart = lexer->lexsize; |
2814 | 619 | continue; |
2815 | 1.57k | } |
2816 | | |
2817 | | /* Netscapes JSTE e.g. <# ... server-code ... #> */ |
2818 | 1.11M | if (c == '#') |
2819 | 898 | { |
2820 | 898 | lexer->lexsize -= 2; |
2821 | 898 | lexer->state = LEX_JSTE; |
2822 | 898 | lexer->txtend = lexer->lexsize; |
2823 | | |
2824 | 898 | CondReturnTextNode(doc, 2) |
2825 | | |
2826 | 340 | lexer->txtstart = lexer->lexsize; |
2827 | 340 | continue; |
2828 | 898 | } |
2829 | | |
2830 | | /* check for start tag */ |
2831 | 1.11M | if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c))) |
2832 | 1.00M | { |
2833 | 1.00M | TY_(UngetChar)(c, doc->docIn); /* push back letter */ |
2834 | 1.00M | TY_(UngetChar)('<', doc->docIn); |
2835 | 1.00M | lexer->lexsize -= 2; /* discard "<" + letter */ |
2836 | 1.00M | lexer->txtend = lexer->lexsize; |
2837 | 1.00M | lexer->state = LEX_STARTTAG; /* ready to read tag name */ |
2838 | | |
2839 | 1.00M | CondReturnTextNode(doc, 2) |
2840 | | |
2841 | | /* lexer->txtstart = lexer->lexsize; missing here? */ |
2842 | 862k | continue; /* no text so keep going */ |
2843 | 1.00M | } |
2844 | | |
2845 | | /* otherwise treat as CDATA */ |
2846 | | /* fix for bug 762102 (486) */ |
2847 | | /* Issue #384 - Fix skipping parsing character, particularly '<<' */ |
2848 | 112k | TY_(UngetChar)(c, doc->docIn); |
2849 | 112k | lexer->lexsize -= 1; |
2850 | 112k | lexer->state = LEX_CONTENT; |
2851 | 112k | lexer->waswhite = no; |
2852 | 112k | continue; |
2853 | | |
2854 | 105k | case LEX_ENDTAG: /* </letter */ |
2855 | 105k | lexer->txtstart = lexer->lexsize - 1; |
2856 | 105k | doc->docIn->curcol += 2; |
2857 | 105k | c = ParseTagName( doc ); |
2858 | 105k | lexer->token = TagToken( doc, EndTag ); /* create endtag token */ |
2859 | 105k | lexer->lexsize = lexer->txtend = lexer->txtstart; |
2860 | | |
2861 | | /* skip to '>' */ |
2862 | 180k | while ( c != '>' && c != EndOfStream ) |
2863 | 74.9k | { |
2864 | 74.9k | c = TY_(ReadChar)(doc->docIn); |
2865 | 74.9k | } |
2866 | | |
2867 | 105k | if (c == EndOfStream) |
2868 | 969 | { |
2869 | 969 | TY_(FreeNode)( doc, lexer->token ); |
2870 | 969 | continue; |
2871 | 969 | } |
2872 | | |
2873 | 104k | lexer->state = LEX_CONTENT; |
2874 | 104k | lexer->waswhite = no; |
2875 | 104k | node = lexer->token; |
2876 | 104k | GTDBG(doc,"endtag", node); |
2877 | 104k | return node; /* the endtag token */ |
2878 | | |
2879 | 1.00M | case LEX_STARTTAG: /* first letter of tagname */ |
2880 | 1.00M | c = TY_(ReadChar)(doc->docIn); |
2881 | 1.00M | ChangeChar(lexer, (tmbchar)c); |
2882 | 1.00M | lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */ |
2883 | 1.00M | c = ParseTagName( doc ); |
2884 | 1.00M | isempty = no; |
2885 | 1.00M | attributes = NULL; |
2886 | 1.00M | lexer->token = TagToken( doc, StartTag ); /* [i_a]2 'isempty' is always false, thanks to code 2 lines above */ |
2887 | | |
2888 | | /* parse attributes, consuming closing ">" */ |
2889 | 1.00M | if (c != '>') |
2890 | 391k | { |
2891 | 391k | if (c == '/') |
2892 | 54.9k | TY_(UngetChar)(c, doc->docIn); |
2893 | | |
2894 | 391k | attributes = ParseAttrs( doc, &isempty ); |
2895 | 391k | } |
2896 | | |
2897 | 1.00M | if (isempty) |
2898 | 13.8k | lexer->token->type = StartEndTag; |
2899 | | |
2900 | 1.00M | lexer->token->attributes = attributes; |
2901 | 1.00M | lexer->lexsize = lexer->txtend = lexer->txtstart; |
2902 | | |
2903 | | /* swallow newline following start tag */ |
2904 | | /* special check needed for CRLF sequence */ |
2905 | | /* this doesn't apply to empty elements */ |
2906 | | /* nor to preformatted content that needs escaping */ |
2907 | | /*\ |
2908 | | * Issue #230: Need to KEEP this user newline character in certain |
2909 | | * circumstances, certainly for <pre>, <script>, <style>... |
2910 | | * Any others? |
2911 | | * Issue #238: maybe **ONLY** for <pre> |
2912 | | \*/ |
2913 | 1.00M | if ( nodeIsPRE(lexer->token) ) |
2914 | 31.6k | { |
2915 | 31.6k | mode = Preformatted; |
2916 | 31.6k | } |
2917 | | |
2918 | 1.00M | if ((mode != Preformatted && ExpectsContent(lexer->token)) |
2919 | 1.00M | || nodeIsBR(lexer->token) || nodeIsHR(lexer->token)) |
2920 | 860k | { |
2921 | 860k | c = TY_(ReadChar)(doc->docIn); |
2922 | | |
2923 | 860k | if ((c == '\n') && (mode != IgnoreWhitespace)) /* Issue #329 - Can NOT afford to lose this newline */ |
2924 | 29.9k | TY_(UngetChar)(c, doc->docIn); /* Issue #329 - make sure the newline is maintained for now */ |
2925 | 830k | else if (c != '\n' && c != '\f') |
2926 | 828k | TY_(UngetChar)(c, doc->docIn); |
2927 | | |
2928 | 860k | lexer->waswhite = yes; /* to swallow leading whitespace */ |
2929 | 860k | } |
2930 | 145k | else |
2931 | 145k | lexer->waswhite = no; |
2932 | | |
2933 | 1.00M | lexer->state = LEX_CONTENT; |
2934 | 1.00M | if (lexer->token->tag == NULL) |
2935 | 169k | { |
2936 | 169k | if (mode != OtherNamespace) /* [i_a]2 only issue warning if NOT 'OtherNamespace', and tag null */ |
2937 | 164k | { |
2938 | | /* Special case for HTML5 unknown tags: if it looks |
2939 | | like an autonomous custom tag, then emit a variation |
2940 | | of the standard message. We don't want to do this |
2941 | | for older HTML, because it's not truly supported |
2942 | | by the standard, although Tidy will allow it. */ |
2943 | 164k | if ( (doc->lexer->doctype & VERS_HTML5) > 0 && TY_(elementIsAutonomousCustomFormat)( lexer->token->element ) ) |
2944 | 393 | TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT_LOOKS_CUSTOM ); |
2945 | 164k | else |
2946 | 164k | TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT ); |
2947 | 164k | } |
2948 | 169k | } |
2949 | 835k | else if ( !cfgBool(doc, TidyXmlTags) ) |
2950 | 835k | { |
2951 | 835k | TY_(ConstrainVersion)( doc, lexer->token->tag->versions ); |
2952 | 835k | TY_(RepairDuplicateAttributes)( doc, lexer->token, no ); |
2953 | 835k | } else |
2954 | 0 | TY_(RepairDuplicateAttributes)( doc, lexer->token, yes ); |
2955 | 1.00M | node = lexer->token; |
2956 | 1.00M | GTDBG(doc,"starttag", node); |
2957 | 1.00M | return node; /* return start tag */ |
2958 | | |
2959 | 2.56M | case LEX_COMMENT: /* seen <!-- so look for --> */ |
2960 | | |
2961 | 2.56M | if (c != '-') |
2962 | 2.55M | continue; |
2963 | | |
2964 | 10.4k | c = TY_(ReadChar)(doc->docIn); |
2965 | | |
2966 | | /* Fix hyphens at beginning of tag */ |
2967 | 10.4k | if ( c != '-' && fixComments && lexer->lexsize - lexer->txtstart == 1 ) |
2968 | 3.67k | { |
2969 | 3.67k | lexer->lexbuf[lexer->lexsize - 1] = '='; |
2970 | 3.67k | } |
2971 | | |
2972 | 10.4k | TY_(AddCharToLexer)(lexer, c); |
2973 | | |
2974 | 10.4k | if (c != '-') |
2975 | 4.09k | continue; |
2976 | | |
2977 | 8.05k | end_comment: |
2978 | 8.05k | c = TY_(ReadChar)(doc->docIn); |
2979 | | |
2980 | 8.05k | if (c == '>') |
2981 | 4.58k | { |
2982 | 4.58k | if (badcomment) |
2983 | 1.19k | { |
2984 | | /* |
2985 | | We've got bad comments that we either fixed or |
2986 | | ignored; provide proper user feedback based on |
2987 | | doctype and whether or not we fixed them. |
2988 | | */ |
2989 | 1.19k | if ( (TY_(HTMLVersion)(doc) & HT50) ) |
2990 | 0 | { |
2991 | 0 | if ( fixComments ) |
2992 | 0 | TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT ); |
2993 | | /* Otherwise for HTML5, it's safe to ignore. */ |
2994 | 0 | } |
2995 | 1.19k | else |
2996 | 1.19k | { |
2997 | 1.19k | if ( fixComments ) |
2998 | 1.19k | TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT ); |
2999 | 0 | else |
3000 | 0 | TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_WARN ); |
3001 | 1.19k | } |
3002 | 1.19k | } |
3003 | | |
3004 | | /* do not store closing -- in lexbuf */ |
3005 | 4.58k | lexer->lexsize -= 2; |
3006 | 4.58k | lexer->txtend = lexer->lexsize; |
3007 | 4.58k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3008 | 4.58k | lexer->state = LEX_CONTENT; |
3009 | 4.58k | lexer->waswhite = no; |
3010 | 4.58k | lexer->token = CommentToken(doc); |
3011 | | |
3012 | | /* now look for a line break */ |
3013 | | |
3014 | 4.58k | c = TY_(ReadChar)(doc->docIn); |
3015 | | |
3016 | 4.58k | if (c == '\n') |
3017 | 334 | lexer->token->linebreak = yes; |
3018 | 4.25k | else |
3019 | 4.25k | TY_(UngetChar)(c, doc->docIn); |
3020 | | |
3021 | 4.58k | node = lexer->token; |
3022 | 4.58k | GTDBG(doc,"comment", node); |
3023 | 4.58k | return node; |
3024 | 4.58k | } |
3025 | | |
3026 | | /* note position of first such error in the comment */ |
3027 | 3.46k | if (!badcomment) |
3028 | 1.22k | { |
3029 | 1.22k | SetLexerLocus( doc, lexer ); |
3030 | 1.22k | lexer->columns -= 3; |
3031 | 1.22k | } |
3032 | | |
3033 | 3.46k | badcomment++; |
3034 | | |
3035 | | /* fix hyphens in the middle */ |
3036 | 3.46k | if ( fixComments ) |
3037 | 3.46k | lexer->lexbuf[lexer->lexsize - 2] = '='; |
3038 | | |
3039 | | /* if '-' then look for '>' to end the comment */ |
3040 | 3.46k | if (c == '-') |
3041 | 1.75k | { |
3042 | 1.75k | TY_(AddCharToLexer)(lexer, c); |
3043 | 1.75k | goto end_comment; |
3044 | 1.75k | } |
3045 | | |
3046 | | /* fix hyphens end, and continue to look for --> */ |
3047 | 1.71k | if ( fixComments ) |
3048 | 1.71k | lexer->lexbuf[lexer->lexsize - 1] = '='; |
3049 | | |
3050 | | /* http://tidy.sf.net/bug/1266647 */ |
3051 | 1.71k | TY_(AddCharToLexer)(lexer, c); |
3052 | | |
3053 | 1.71k | continue; |
3054 | | |
3055 | 55.0k | case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */ |
3056 | | |
3057 | | /* use ParseDocTypeDecl() to tokenize doctype declaration */ |
3058 | 55.0k | TY_(UngetChar)(c, doc->docIn); |
3059 | 55.0k | lexer->lexsize -= 1; |
3060 | 55.0k | lexer->token = ParseDocTypeDecl(doc); |
3061 | | |
3062 | 55.0k | lexer->txtend = lexer->lexsize; |
3063 | 55.0k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3064 | 55.0k | lexer->state = LEX_CONTENT; |
3065 | 55.0k | lexer->waswhite = no; |
3066 | | |
3067 | | /* make a note of the version named by the 1st doctype */ |
3068 | 55.0k | if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags)) |
3069 | 10.9k | { |
3070 | 10.9k | lexer->doctype = FindGivenVersion(doc, lexer->token); |
3071 | 10.9k | if (lexer->doctype != VERS_HTML5) |
3072 | 10.8k | { |
3073 | | /*\ |
3074 | | * Back to legacy HTML4 mode for - |
3075 | | * Issue #167 & #169 - TidyTag_A |
3076 | | * Issue #196 - TidyTag_CAPTION |
3077 | | * others? |
3078 | | \*/ |
3079 | 10.8k | TY_(AdjustTags)(doc); /* Dynamically modify the tags table */ |
3080 | 10.8k | } |
3081 | 10.9k | } |
3082 | 55.0k | node = lexer->token; |
3083 | 55.0k | GTDBG(doc,"doctype", node); |
3084 | 55.0k | return node; |
3085 | | |
3086 | 43.3M | case LEX_PROCINSTR: /* seen <? so look for '>' */ |
3087 | | /* check for PHP preprocessor instructions <?php ... ?> */ |
3088 | | |
3089 | 43.3M | if (lexer->lexsize - lexer->txtstart == 3) |
3090 | 9.56k | { |
3091 | 9.56k | if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0) |
3092 | 234 | { |
3093 | 234 | lexer->state = LEX_PHP; |
3094 | 234 | continue; |
3095 | 234 | } |
3096 | 9.56k | } |
3097 | | |
3098 | 43.3M | if (lexer->lexsize - lexer->txtstart == 4) |
3099 | 9.02k | { |
3100 | 9.02k | if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 && |
3101 | 9.02k | TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3])) |
3102 | 4.95k | { |
3103 | 4.95k | lexer->state = LEX_XMLDECL; |
3104 | 4.95k | attributes = NULL; |
3105 | 4.95k | continue; |
3106 | 4.95k | } |
3107 | 9.02k | } |
3108 | | |
3109 | 43.3M | if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */ |
3110 | 2.40M | { |
3111 | 2.40M | if (c != '?') |
3112 | 2.40M | continue; |
3113 | | |
3114 | | /* now look for '>' */ |
3115 | 632 | c = TY_(ReadChar)(doc->docIn); |
3116 | | |
3117 | 632 | if (c == EndOfStream) |
3118 | 3 | { |
3119 | 3 | TY_(Report)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE ); |
3120 | 3 | TY_(UngetChar)(c, doc->docIn); |
3121 | 3 | continue; |
3122 | 3 | } |
3123 | | |
3124 | 629 | TY_(AddCharToLexer)(lexer, c); |
3125 | 629 | } |
3126 | | |
3127 | | |
3128 | 40.9M | if (c != '>') |
3129 | 40.8M | continue; |
3130 | | |
3131 | 15.9k | lexer->lexsize -= 1; |
3132 | | |
3133 | 15.9k | if (lexer->lexsize) |
3134 | 14.2k | { |
3135 | 14.2k | uint i; |
3136 | 14.2k | Bool closed; |
3137 | | |
3138 | 65.1k | for (i = 0; i < lexer->lexsize - lexer->txtstart && |
3139 | 65.1k | !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i) |
3140 | 50.8k | /**/; |
3141 | | |
3142 | 14.2k | closed = lexer->lexbuf[lexer->lexsize - 1] == '?'; |
3143 | | |
3144 | 14.2k | if (closed) |
3145 | 2.75k | lexer->lexsize -= 1; |
3146 | | |
3147 | 14.2k | lexer->txtstart += i; |
3148 | 14.2k | lexer->txtend = lexer->lexsize; |
3149 | 14.2k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3150 | | |
3151 | 14.2k | lexer->token = PIToken(doc); |
3152 | 14.2k | lexer->token->closed = closed; |
3153 | 14.2k | lexer->token->element = TY_(tmbstrndup)(doc->allocator, |
3154 | 14.2k | lexer->lexbuf + |
3155 | 14.2k | lexer->txtstart - i, i); |
3156 | 14.2k | } |
3157 | 1.68k | else |
3158 | 1.68k | { |
3159 | 1.68k | lexer->txtend = lexer->lexsize; |
3160 | 1.68k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3161 | 1.68k | lexer->token = PIToken(doc); |
3162 | 1.68k | } |
3163 | | |
3164 | 15.9k | lexer->state = LEX_CONTENT; |
3165 | 15.9k | lexer->waswhite = no; |
3166 | 15.9k | node = lexer->token; |
3167 | 15.9k | GTDBG(doc,"procinstr", node); |
3168 | 15.9k | return node; |
3169 | | |
3170 | 16.8k | case LEX_ASP: /* seen <% so look for "%>" */ |
3171 | 16.8k | if (c != '%') |
3172 | 10.1k | continue; |
3173 | | |
3174 | | /* now look for '>' */ |
3175 | 6.78k | c = TY_(ReadChar)(doc->docIn); |
3176 | | |
3177 | | |
3178 | 6.78k | if (c != '>') |
3179 | 5.29k | { |
3180 | 5.29k | TY_(UngetChar)(c, doc->docIn); |
3181 | 5.29k | continue; |
3182 | 5.29k | } |
3183 | | |
3184 | 1.49k | lexer->lexsize -= 1; |
3185 | 1.49k | lexer->txtend = lexer->lexsize; |
3186 | 1.49k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3187 | 1.49k | lexer->state = LEX_CONTENT; |
3188 | 1.49k | lexer->waswhite = no; |
3189 | 1.49k | lexer->token = AspToken(doc); |
3190 | 1.49k | node = lexer->token; |
3191 | 1.49k | GTDBG(doc,"ASP", node); |
3192 | 1.49k | return node; /* the endtag token */ |
3193 | | |
3194 | | |
3195 | | |
3196 | 976k | case LEX_JSTE: /* seen <# so look for "#>" */ |
3197 | 976k | if (c != '#') |
3198 | 974k | continue; |
3199 | | |
3200 | | /* now look for '>' */ |
3201 | 1.27k | c = TY_(ReadChar)(doc->docIn); |
3202 | | |
3203 | | |
3204 | 1.27k | if (c != '>') |
3205 | 438 | { |
3206 | 438 | TY_(UngetChar)(c, doc->docIn); |
3207 | 438 | continue; |
3208 | 438 | } |
3209 | | |
3210 | 837 | lexer->lexsize -= 1; |
3211 | 837 | lexer->txtend = lexer->lexsize; |
3212 | 837 | lexer->lexbuf[lexer->lexsize] = '\0'; |
3213 | 837 | lexer->state = LEX_CONTENT; |
3214 | 837 | lexer->waswhite = no; |
3215 | 837 | lexer->token = JsteToken(doc); |
3216 | 837 | node = lexer->token; |
3217 | 837 | GTDBG(doc,"JSTE", node); |
3218 | 837 | return node; /* the JSTE token */ |
3219 | | |
3220 | | |
3221 | 926 | case LEX_PHP: /* seen "<?php" so look for "?>" */ |
3222 | 926 | if (c != '?') |
3223 | 374 | continue; |
3224 | | |
3225 | | /* now look for '>' */ |
3226 | 552 | c = TY_(ReadChar)(doc->docIn); |
3227 | | |
3228 | 552 | if (c != '>') |
3229 | 347 | { |
3230 | 347 | TY_(UngetChar)(c, doc->docIn); |
3231 | 347 | continue; |
3232 | 347 | } |
3233 | | |
3234 | 205 | lexer->lexsize -= 1; |
3235 | 205 | lexer->txtend = lexer->lexsize; |
3236 | 205 | lexer->lexbuf[lexer->lexsize] = '\0'; |
3237 | 205 | lexer->state = LEX_CONTENT; |
3238 | 205 | lexer->waswhite = no; |
3239 | 205 | lexer->token = PhpToken(doc); |
3240 | 205 | node = lexer->token; |
3241 | 205 | GTDBG(doc,"PHP", node); |
3242 | 205 | return node; /* the PHP token */ |
3243 | | |
3244 | 13.2k | case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */ |
3245 | | |
3246 | 13.2k | if (TY_(IsWhite)(c) && c != '?') |
3247 | 4.63k | continue; |
3248 | | |
3249 | | /* get pseudo-attribute */ |
3250 | 8.63k | if (c != '?') |
3251 | 6.63k | { |
3252 | 6.63k | tmbstr name; |
3253 | 6.63k | Node *asp, *php; |
3254 | 6.63k | AttVal *av = NULL; |
3255 | 6.63k | int pdelim = 0; |
3256 | 6.63k | isempty = no; |
3257 | | |
3258 | 6.63k | TY_(UngetChar)(c, doc->docIn); |
3259 | | |
3260 | 6.63k | name = ParseAttribute( doc, &isempty, &asp, &php ); |
3261 | | |
3262 | 6.63k | if (!name) |
3263 | 3.64k | { |
3264 | | /* check if attributes are created by ASP markup */ |
3265 | 3.64k | if (asp) |
3266 | 661 | { |
3267 | 661 | av = TY_(NewAttribute)(doc); |
3268 | 661 | av->asp = asp; |
3269 | 661 | AddAttrToList( &attributes, av ); |
3270 | 661 | } |
3271 | | |
3272 | | /* check if attributes are created by PHP markup */ |
3273 | 3.64k | if (php) |
3274 | 1.71k | { |
3275 | 1.71k | av = TY_(NewAttribute)(doc); |
3276 | 1.71k | av->php = php; |
3277 | 1.71k | AddAttrToList( &attributes, av ); |
3278 | 1.71k | } |
3279 | | |
3280 | | /* fix for http://tidy.sf.net/bug/788031 */ |
3281 | 3.64k | lexer->lexsize -= 1; |
3282 | 3.64k | lexer->txtend = lexer->txtstart; |
3283 | 3.64k | lexer->lexbuf[lexer->txtend] = '\0'; |
3284 | 3.64k | lexer->state = LEX_CONTENT; |
3285 | 3.64k | lexer->waswhite = no; |
3286 | 3.64k | lexer->token = XmlDeclToken(doc); |
3287 | 3.64k | lexer->token->attributes = attributes; |
3288 | 3.64k | node = lexer->token; |
3289 | 3.64k | GTDBG(doc,"xml", node); |
3290 | 3.64k | return node; /* the xml token */ |
3291 | 3.64k | } |
3292 | | |
3293 | 2.98k | av = TY_(NewAttribute)(doc); |
3294 | 2.98k | av->attribute = name; |
3295 | 2.98k | av->value = ParseValue( doc, name, yes, &isempty, &pdelim ); |
3296 | 2.98k | av->delim = pdelim; |
3297 | 2.98k | av->dict = TY_(FindAttribute)( doc, av ); |
3298 | | |
3299 | 2.98k | AddAttrToList( &attributes, av ); |
3300 | | /* continue; */ |
3301 | 2.98k | } |
3302 | | |
3303 | | /* now look for '>' */ |
3304 | 4.98k | c = TY_(ReadChar)(doc->docIn); |
3305 | | |
3306 | 4.98k | if (c != '>') |
3307 | 3.74k | { |
3308 | 3.74k | TY_(UngetChar)(c, doc->docIn); |
3309 | 3.74k | continue; |
3310 | 3.74k | } |
3311 | 1.23k | lexer->lexsize -= 1; |
3312 | 1.23k | lexer->txtend = lexer->txtstart; |
3313 | 1.23k | lexer->lexbuf[lexer->txtend] = '\0'; |
3314 | 1.23k | lexer->state = LEX_CONTENT; |
3315 | 1.23k | lexer->waswhite = no; |
3316 | 1.23k | lexer->token = XmlDeclToken(doc); |
3317 | 1.23k | lexer->token->attributes = attributes; |
3318 | 1.23k | node = lexer->token; |
3319 | 1.23k | GTDBG(doc,"XML", node); |
3320 | 1.23k | return node; /* the XML token */ |
3321 | | |
3322 | 90.2M | case LEX_SECTION: /* seen "<![" so look for "]>" */ |
3323 | 90.2M | if (c == '[') |
3324 | 2.77k | { |
3325 | 2.77k | if (lexer->lexsize == (lexer->txtstart + 6) && |
3326 | 2.77k | TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0) |
3327 | 1.10k | { |
3328 | 1.10k | lexer->state = LEX_CDATA; |
3329 | 1.10k | lexer->lexsize -= 6; |
3330 | 1.10k | continue; |
3331 | 1.10k | } |
3332 | 2.77k | } |
3333 | | |
3334 | 90.2M | if (c == '>') |
3335 | 1.98k | { |
3336 | | /* Is. #462 - reached '>' before ']' */ |
3337 | 1.98k | TY_(UngetChar)(c, doc->docIn); |
3338 | 90.2M | } else if (c != ']') |
3339 | 90.2M | continue; |
3340 | | |
3341 | | /* now look for '>' */ |
3342 | 8.48k | c = TY_(ReadChar)(doc->docIn); |
3343 | | |
3344 | 8.48k | lexdump = 1; |
3345 | 8.48k | if (c != '>') |
3346 | 6.24k | { |
3347 | | /* Issue #153 - can also be ]'-->' */ |
3348 | 6.24k | if (c == '-') |
3349 | 917 | { |
3350 | 917 | c = TY_(ReadChar)(doc->docIn); |
3351 | 917 | if (c == '-') |
3352 | 546 | { |
3353 | 546 | c = TY_(ReadChar)(doc->docIn); |
3354 | 546 | if (c != '>') |
3355 | 222 | { |
3356 | 222 | TY_(UngetChar)(c, doc->docIn); |
3357 | 222 | TY_(UngetChar)('-', doc->docIn); |
3358 | 222 | TY_(UngetChar)('-', doc->docIn); |
3359 | 222 | continue; |
3360 | 222 | } |
3361 | | /* this failed! |
3362 | | TY_(AddCharToLexer)(lexer, '-'); TY_(AddCharToLexer)(lexer, '-'); lexdump = 0; |
3363 | | got output <![endif]--]> - needs further fix in pprint section output |
3364 | | */ |
3365 | 546 | } |
3366 | 371 | else |
3367 | 371 | { |
3368 | 371 | TY_(UngetChar)(c, doc->docIn); |
3369 | 371 | TY_(UngetChar)('-', doc->docIn); |
3370 | 371 | continue; |
3371 | 371 | } |
3372 | 917 | } |
3373 | 5.32k | else |
3374 | 5.32k | { |
3375 | 5.32k | TY_(UngetChar)(c, doc->docIn); |
3376 | 5.32k | continue; |
3377 | 5.32k | } |
3378 | 6.24k | } |
3379 | | |
3380 | 2.56k | lexer->lexsize -= lexdump; |
3381 | 2.56k | lexer->txtend = lexer->lexsize; |
3382 | 2.56k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3383 | 2.56k | lexer->state = LEX_CONTENT; |
3384 | 2.56k | lexer->waswhite = no; |
3385 | 2.56k | lexer->token = SectionToken(doc); |
3386 | 2.56k | node = lexer->token; |
3387 | 2.56k | GTDBG(doc,"SECTION", node); |
3388 | 2.56k | return node; /* the SECTION token */ |
3389 | | |
3390 | 35.1k | case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */ |
3391 | 35.1k | if (c != ']') |
3392 | 32.2k | continue; |
3393 | | |
3394 | | /* now look for ']' */ |
3395 | 2.91k | c = TY_(ReadChar)(doc->docIn); |
3396 | | |
3397 | 2.91k | if (c != ']') |
3398 | 1.56k | { |
3399 | 1.56k | TY_(UngetChar)(c, doc->docIn); |
3400 | 1.56k | continue; |
3401 | 1.56k | } |
3402 | | |
3403 | | /* now look for '>' */ |
3404 | 1.35k | c = TY_(ReadChar)(doc->docIn); |
3405 | | |
3406 | 1.35k | if (c != '>') |
3407 | 278 | { |
3408 | 278 | TY_(UngetChar)(c, doc->docIn); |
3409 | 278 | TY_(UngetChar)(']', doc->docIn); |
3410 | 278 | continue; |
3411 | 278 | } |
3412 | | |
3413 | 1.07k | lexer->lexsize -= 1; |
3414 | 1.07k | lexer->txtend = lexer->lexsize; |
3415 | 1.07k | lexer->lexbuf[lexer->lexsize] = '\0'; |
3416 | 1.07k | lexer->state = LEX_CONTENT; |
3417 | 1.07k | lexer->waswhite = no; |
3418 | 1.07k | lexer->token = CDATAToken(doc); |
3419 | 1.07k | node = lexer->token; |
3420 | 1.07k | GTDBG(doc,"CDATA", node); |
3421 | 1.07k | return node; /* the CDATA token */ |
3422 | 241M | } |
3423 | 241M | } |
3424 | | |
3425 | 1.50M | if (lexer->state == LEX_CONTENT) /* text string */ |
3426 | 1.43M | { |
3427 | 1.43M | lexer->txtend = lexer->lexsize; |
3428 | | |
3429 | 1.43M | if (lexer->txtend > lexer->txtstart) |
3430 | 3.22k | { |
3431 | 3.22k | TY_(UngetChar)(c, doc->docIn); |
3432 | | |
3433 | 3.22k | if (lexer->lexbuf[lexer->lexsize - 1] == ' ') |
3434 | 238 | { |
3435 | 238 | lexer->lexsize -= 1; |
3436 | 238 | lexer->txtend = lexer->lexsize; |
3437 | 238 | } |
3438 | 3.22k | lexer->token = TY_(TextToken)(lexer); |
3439 | 3.22k | node = lexer->token; |
3440 | 3.22k | GTDBG(doc,"textstring", node); |
3441 | 3.22k | return node; /* the textstring token */ |
3442 | 3.22k | } |
3443 | 1.43M | } |
3444 | 71.4k | else if (lexer->state == LEX_COMMENT) /* comment */ |
3445 | 90 | { |
3446 | 90 | if (c == EndOfStream) |
3447 | 90 | { |
3448 | | /* We print this if we reached end of the stream mid-comment. */ |
3449 | 90 | TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_EOS ); |
3450 | 90 | } |
3451 | | |
3452 | 90 | lexer->txtend = lexer->lexsize; |
3453 | 90 | lexer->lexbuf[lexer->lexsize] = '\0'; |
3454 | 90 | lexer->state = LEX_CONTENT; |
3455 | 90 | lexer->waswhite = no; |
3456 | 90 | lexer->token = CommentToken(doc); |
3457 | 90 | node = lexer->token; |
3458 | 90 | GTDBG(doc,"COMMENT", node); |
3459 | 90 | return node; /* the COMMENT token */ |
3460 | 90 | } |
3461 | | |
3462 | | /* check attributes before return NULL */ |
3463 | 1.50M | if (attributes) |
3464 | 61 | TY_(FreeAttribute)( doc, attributes ); |
3465 | | |
3466 | 1.50M | DEBUG_LOG(SPRTF("Returning NULL...\n")); |
3467 | 1.50M | return NULL; |
3468 | 1.50M | } |
3469 | | |
3470 | | static void MapStr( ctmbstr str, uint code ) |
3471 | 119k | { |
3472 | 1.53M | while ( *str ) |
3473 | 1.41M | { |
3474 | 1.41M | uint i = (byte) *str++; |
3475 | 1.41M | lexmap[i] |= code; |
3476 | 1.41M | } |
3477 | 119k | } |
3478 | | |
3479 | | void TY_(InitMap)(void) |
3480 | 17.0k | { |
3481 | 17.0k | MapStr("\r\n\f", newline|white); |
3482 | 17.0k | MapStr(" \t", white); |
3483 | 17.0k | MapStr("-.:_", namechar); |
3484 | 17.0k | MapStr("0123456789", digit|digithex|namechar); |
3485 | 17.0k | MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar); |
3486 | 17.0k | MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar); |
3487 | 17.0k | MapStr("abcdefABCDEF", digithex); |
3488 | 17.0k | } |
3489 | | |
3490 | | /* |
3491 | | parser for ASP within start tags |
3492 | | |
3493 | | Some people use ASP to customize attributes. |
3494 | | Tidy isn't really well suited to dealing with ASP. |
3495 | | This is a workaround for attributes, but it won't |
3496 | | deal with the case where the ASP is used to tailor |
3497 | | the attribute value. Here is an example of a workaround |
3498 | | for using ASP in attribute values: |
3499 | | |
3500 | | href='<%=rsSchool.Fields("ID").Value%>' |
3501 | | |
3502 | | where the ASP that generates the attribute value |
3503 | | is masked from Tidy by the quotemarks. |
3504 | | |
3505 | | */ |
3506 | | |
3507 | | static Node *ParseAsp( TidyDocImpl* doc ) |
3508 | 1.71k | { |
3509 | 1.71k | Lexer* lexer = doc->lexer; |
3510 | 1.71k | uint c; |
3511 | 1.71k | Node *asp = NULL; |
3512 | | |
3513 | 1.71k | lexer->txtstart = lexer->lexsize; |
3514 | | |
3515 | 1.71k | for (;;) |
3516 | 1.13M | { |
3517 | 1.13M | if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) |
3518 | 87 | break; |
3519 | | |
3520 | 1.13M | TY_(AddCharToLexer)(lexer, c); |
3521 | | |
3522 | | |
3523 | 1.13M | if (c != '%') |
3524 | 1.10M | continue; |
3525 | | |
3526 | 38.2k | if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) |
3527 | 7 | break; |
3528 | | |
3529 | 38.2k | TY_(AddCharToLexer)(lexer, c); |
3530 | | |
3531 | 38.2k | if (c == '>') |
3532 | 1.62k | { |
3533 | 1.62k | lexer->lexsize -= 2; |
3534 | 1.62k | break; |
3535 | 1.62k | } |
3536 | 38.2k | } |
3537 | | |
3538 | 1.71k | lexer->txtend = lexer->lexsize; |
3539 | 1.71k | if (lexer->txtend > lexer->txtstart) |
3540 | 1.46k | asp = AspToken(doc); |
3541 | | |
3542 | 1.71k | lexer->txtstart = lexer->txtend; |
3543 | 1.71k | return asp; |
3544 | 1.71k | } |
3545 | | |
3546 | | |
3547 | | /* |
3548 | | PHP is like ASP but is based upon XML |
3549 | | processing instructions, e.g. <?php ... ?> |
3550 | | */ |
3551 | | static Node *ParsePhp( TidyDocImpl* doc ) |
3552 | 4.00k | { |
3553 | 4.00k | Lexer* lexer = doc->lexer; |
3554 | 4.00k | uint c; |
3555 | 4.00k | Node *php = NULL; |
3556 | | |
3557 | 4.00k | lexer->txtstart = lexer->lexsize; |
3558 | | |
3559 | 4.00k | for (;;) |
3560 | 3.57M | { |
3561 | 3.57M | if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) |
3562 | 116 | break; |
3563 | | |
3564 | 3.57M | TY_(AddCharToLexer)(lexer, c); |
3565 | | |
3566 | | |
3567 | 3.57M | if (c != '?') |
3568 | 3.56M | continue; |
3569 | | |
3570 | 6.41k | if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream) |
3571 | 9 | break; |
3572 | | |
3573 | 6.40k | TY_(AddCharToLexer)(lexer, c); |
3574 | | |
3575 | 6.40k | if (c == '>') |
3576 | 3.88k | { |
3577 | 3.88k | lexer->lexsize -= 2; |
3578 | 3.88k | break; |
3579 | 3.88k | } |
3580 | 6.40k | } |
3581 | | |
3582 | 4.00k | lexer->txtend = lexer->lexsize; |
3583 | 4.00k | if (lexer->txtend > lexer->txtstart) |
3584 | 3.75k | php = PhpToken(doc); |
3585 | | |
3586 | 4.00k | lexer->txtstart = lexer->txtend; |
3587 | 4.00k | return php; |
3588 | 4.00k | } |
3589 | | |
/* consumes the '>' terminating start tags */
/* @TODO: float the errors back to the calling method */
/*
 Reads the next attribute name from the input stream.

 Returns a newly allocated copy of the name (made with tmbstrndup on
 doc->allocator), or NULL when no name was collected. NULL is returned,
 among other cases, when:
   - "/>" was read: *isempty is set to yes;
   - ">" was read;
   - "<%" was read: *asp receives the result of ParseAsp();
   - "<?" was read: *php receives the result of ParsePhp();
   - any other "<..." was read: both characters are pushed back and
     UNEXPECTED_GT is reported;
   - the end of the stream was reached.
 *asp and *php are always reset to NULL on entry. The lexer buffer is
 used as scratch space and lexer->lexsize is restored before returning
 a name.
*/
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty,
                              Node **asp, Node **php )
{
    Lexer* lexer = doc->lexer;
    int start, len = 0;
    tmbstr attr = NULL;
    uint c, lastc;

    *asp = NULL;  /* clear asp pointer */
    *php = NULL;  /* clear php pointer */

    /* skip white space before the attribute */

    for (;;)
    {
        c = TY_(ReadChar)( doc->docIn );


        if (c == '/')
        {
            /* peek one character to tell "/>" apart from a lone '/' */
            c = TY_(ReadChar)( doc->docIn );

            if (c == '>')
            {
                *isempty = yes;
                return NULL;
            }

            /* lone '/': it becomes the first character of the name */
            TY_(UngetChar)(c, doc->docIn);
            c = '/';
            break;
        }

        if (c == '>')
            return NULL;

        if (c =='<')
        {
            c = TY_(ReadChar)(doc->docIn);

            if (c == '%')
            {
                *asp = ParseAsp( doc );
                return NULL;
            }
            else if (c == '?')
            {
                *php = ParsePhp( doc );
                return NULL;
            }

            /* push back in reverse order so '<' is re-read first */
            TY_(UngetChar)(c, doc->docIn);
            TY_(UngetChar)('<', doc->docIn);
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
            return NULL;
        }

        /* stray '=' and quote marks are reported and then skipped */
        if (c == '=')
        {
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
            continue;
        }

        if (c == '"' || c == '\'')
        {
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
            continue;
        }

        if (c == EndOfStream)
        {
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
            TY_(UngetChar)(c, doc->docIn);
            return NULL;
        }


        if (!TY_(IsWhite)(c))
            break;
    }

    /* the name is accumulated in the lexer buffer from this offset */
    start = lexer->lexsize;
    lastc = c;

    for (;;)
    {
        /* but push back '=' for parseValue() */
        if (c == '=' || c == '>')
        {
            TY_(UngetChar)(c, doc->docIn);
            break;
        }

        if (c == '<' || c == EndOfStream)
        {
            TY_(UngetChar)(c, doc->docIn);
            break;
        }

        /* a quote mark directly after '-': drop the buffered '-' and stop */
        if (lastc == '-' && (c == '"' || c == '\''))
        {
            lexer->lexsize--;
            /* NOTE(review): len is recomputed after the loop, so this decrement has no effect */
            --len;
            TY_(UngetChar)(c, doc->docIn);
            break;
        }

        if (TY_(IsWhite)(c))
            break;

        if (c == '/') /* Issue #395 - potential self closing tag */
        {
            c = TY_(ReadChar)(doc->docIn); /* read next */
            if (c == '>')
            {
                /* got a self closing tag - put it back and continue... */
                /* only the '>' is pushed back; the '/' is not re-queued */
                TY_(UngetChar)(c, doc->docIn);
                break;
            }
            else
            {
                /* Not '/>' - put it back */
                TY_(UngetChar)(c, doc->docIn);
                c = '/'; /* restore original char */
            }
        }

        /* what should be done about non-namechar characters? */
        /* currently these are incorporated into the attr name */

        /* fold to lower case unless case is preserved or XML tags are in use */
        if ( cfg(doc, TidyUpperCaseAttrs) != TidyUppercasePreserve )
        {
            if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
                c = TY_(ToLower)(c);
        }

        TY_(AddCharToLexer)( lexer, c );
        lastc = c;
        c = TY_(ReadChar)(doc->docIn);
    }

    /* handle attribute names with multibyte chars */
    /* the length is measured in buffer bytes, not in characters read */
    len = lexer->lexsize - start;
    attr = (len > 0 ? TY_(tmbstrndup)(doc->allocator,
                                      lexer->lexbuf+start, len) : NULL);
    /* release the scratch space used for the name */
    lexer->lexsize = start;
    return attr;
}
3740 | | |
/*
 invoked when < is seen in place of attribute value
 but terminates on whitespace if not ASP, PHP or Tango
 this routine recognizes ' and " quoted strings

 The characters read are appended to the lexer buffer; the caller is
 expected to have buffered the leading '<' already.

 Returns the delimiter to use for the value: '"' by default, '\'' once
 a double-quoted string has been seen inside the instruction, or 0 when
 a quoted string is cut short by '>' or by the end of the stream (an
 error is reported in both cases).
*/
static int ParseServerInstruction( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    uint c;
    int delim = '"';
    Bool isrule = no;

    /* NOTE(review): the first character is buffered without an EndOfStream check - confirm this is intended */
    c = TY_(ReadChar)(doc->docIn);
    TY_(AddCharToLexer)(lexer, c);

    /* check for ASP, PHP or Tango */
    if (c == '%' || c == '?' || c == '@')
        isrule = yes;

    for (;;)
    {
        c = TY_(ReadChar)(doc->docIn);

        if (c == EndOfStream)
            break;

        if (c == '>')
        {
            /* for a recognized rule '>' is part of the value, otherwise it is left for the caller */
            if (isrule)
                TY_(AddCharToLexer)(lexer, c);
            else
                TY_(UngetChar)(c, doc->docIn);

            break;
        }

        /* if not recognized as ASP, PHP or Tango */
        /* then also finish value on whitespace */
        if (!isrule)
        {
            if (TY_(IsWhite)(c))
                break;
        }

        TY_(AddCharToLexer)(lexer, c);

        if (c == '"')
        {
            /* copy a double-quoted string up to and including its closing quote */
            do
            {
                c = TY_(ReadChar)(doc->docIn);
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
                    TY_(UngetChar)(c, doc->docIn);
                    return 0;
                }
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    TY_(UngetChar)(c, doc->docIn);
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
                    return 0;
                }
                TY_(AddCharToLexer)(lexer, c);
            }
            while (c != '"');
            /* the value contains double quotes, so switch the delimiter to a single quote */
            delim = '\'';
            continue;
        }

        if (c == '\'')
        {
            /* copy a single-quoted string; the delimiter is left unchanged */
            do
            {
                c = TY_(ReadChar)(doc->docIn);
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
                    TY_(UngetChar)(c, doc->docIn);
                    return 0;
                }
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
                {
                    TY_(UngetChar)(c, doc->docIn);
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
                    return 0;
                }
                TY_(AddCharToLexer)(lexer, c);
            }
            while (c != '\'');
        }
    }

    return delim;
}
3836 | | |
3837 | | /* values start with "=" or " = " etc. */ |
3838 | | /* doesn't consume the ">" at end of start tag */ |
3839 | | |
3840 | | static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, |
3841 | | Bool foldCase, Bool *isempty, int *pdelim) |
3842 | 291k | { |
3843 | 291k | Lexer* lexer = doc->lexer; |
3844 | 291k | int len = 0, start; |
3845 | 291k | Bool seen_gt = no; |
3846 | 291k | Bool munge = yes; |
3847 | 291k | uint c, lastc, delim, quotewarning; |
3848 | 291k | tmbstr value; |
3849 | | |
3850 | 291k | delim = (tmbchar) 0; |
3851 | 291k | *pdelim = '"'; |
3852 | | |
3853 | | /* |
3854 | | Henry Zrepa reports that some folk are using the |
3855 | | embed element with script attributes where newlines |
3856 | | are significant and must be preserved |
3857 | | */ |
3858 | 291k | if ( cfgBool(doc, TidyLiteralAttribs) ) |
3859 | 0 | munge = no; |
3860 | | |
3861 | | /* skip white space before the '=' */ |
3862 | | |
3863 | 291k | for (;;) |
3864 | 820k | { |
3865 | 820k | c = TY_(ReadChar)(doc->docIn); |
3866 | | |
3867 | 820k | if (c == EndOfStream) |
3868 | 1.18k | { |
3869 | 1.18k | TY_(UngetChar)(c, doc->docIn); |
3870 | 1.18k | break; |
3871 | 1.18k | } |
3872 | | |
3873 | 819k | if (!TY_(IsWhite)(c)) |
3874 | 290k | break; |
3875 | 819k | } |
3876 | | |
3877 | | /* |
3878 | | c should be '=' if there is a value |
3879 | | other legal possibilities are white |
3880 | | space, '/' and '>' |
3881 | | */ |
3882 | | |
3883 | 291k | if (c != '=' && c != '"' && c != '\'') |
3884 | 229k | { |
3885 | 229k | TY_(UngetChar)(c, doc->docIn); |
3886 | 229k | return NULL; |
3887 | 229k | } |
3888 | | |
3889 | | /* skip white space after '=' */ |
3890 | | |
3891 | 61.7k | for (;;) |
3892 | 64.0k | { |
3893 | 64.0k | c = TY_(ReadChar)(doc->docIn); |
3894 | | |
3895 | 64.0k | if (c == EndOfStream) |
3896 | 60 | { |
3897 | 60 | TY_(UngetChar)(c, doc->docIn); |
3898 | 60 | break; |
3899 | 60 | } |
3900 | | |
3901 | 64.0k | if (!TY_(IsWhite)(c)) |
3902 | 61.6k | break; |
3903 | 64.0k | } |
3904 | | |
3905 | | /* check for quote marks */ |
3906 | | |
3907 | 61.7k | if (c == '"' || c == '\'') |
3908 | 19.6k | delim = c; |
3909 | 42.0k | else if (c == '<') |
3910 | 2.70k | { |
3911 | 2.70k | start = lexer->lexsize; |
3912 | 2.70k | TY_(AddCharToLexer)(lexer, c); |
3913 | 2.70k | *pdelim = ParseServerInstruction( doc ); |
3914 | 2.70k | len = lexer->lexsize - start; |
3915 | 2.70k | lexer->lexsize = start; |
3916 | 2.70k | return (len > 0 ? TY_(tmbstrndup)(doc->allocator, |
3917 | 2.70k | lexer->lexbuf+start, len) : NULL); |
3918 | 2.70k | } |
3919 | 39.3k | else |
3920 | 39.3k | TY_(UngetChar)(c, doc->docIn); |
3921 | | |
3922 | | /* |
3923 | | and read the value string |
3924 | | check for quote mark if needed |
3925 | | */ |
3926 | | |
3927 | 59.0k | quotewarning = 0; |
3928 | 59.0k | start = lexer->lexsize; |
3929 | 59.0k | c = '\0'; |
3930 | | |
3931 | 59.0k | for (;;) |
3932 | 2.08M | { |
3933 | 2.08M | lastc = c; /* track last character */ |
3934 | 2.08M | c = TY_(ReadChar)(doc->docIn); |
3935 | | |
3936 | 2.08M | if (c == EndOfStream) |
3937 | 1.10k | { |
3938 | 1.10k | TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR ); |
3939 | 1.10k | TY_(UngetChar)(c, doc->docIn); |
3940 | 1.10k | break; |
3941 | 1.10k | } |
3942 | | |
3943 | 2.08M | if (delim == (tmbchar)0) |
3944 | 257k | { |
3945 | 257k | if (c == '>') |
3946 | 2.67k | { |
3947 | 2.67k | TY_(UngetChar)(c, doc->docIn); |
3948 | 2.67k | break; |
3949 | 2.67k | } |
3950 | | |
3951 | 255k | if (c == '"' || c == '\'') |
3952 | 4.19k | { |
3953 | 4.19k | uint q = c; |
3954 | | |
3955 | | /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */ |
3956 | | /* this doesn't handle <a title=foo"/> which browsers treat as */ |
3957 | | /* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */ |
3958 | | |
3959 | 4.19k | c = TY_(ReadChar)(doc->docIn); |
3960 | 4.19k | if (c == '>') |
3961 | 637 | { |
3962 | 637 | TY_(AddCharToLexer)(lexer, q); |
3963 | 637 | TY_(UngetChar)(c, doc->docIn); |
3964 | 637 | break; |
3965 | 637 | } |
3966 | 3.55k | else |
3967 | 3.55k | { |
3968 | 3.55k | TY_(UngetChar)(c, doc->docIn); |
3969 | 3.55k | c = q; |
3970 | 3.55k | } |
3971 | 4.19k | } |
3972 | | |
3973 | 254k | if (c == '<') |
3974 | 23.8k | { |
3975 | 23.8k | TY_(UngetChar)(c, doc->docIn); |
3976 | 23.8k | c = '>'; |
3977 | 23.8k | TY_(UngetChar)(c, doc->docIn); |
3978 | 23.8k | TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT ); |
3979 | 23.8k | break; |
3980 | 23.8k | } |
3981 | | |
3982 | | /* |
3983 | | For cases like <br clear=all/> need to avoid treating /> as |
3984 | | part of the attribute value, however care is needed to avoid |
3985 | | so treating <a href=http://www.acme.com/> in this way, which |
3986 | | would map the <a> tag to <a href="http://www.acme.com"/> |
3987 | | */ |
3988 | 230k | if (c == '/') |
3989 | 2.78k | { |
3990 | | /* peek ahead in case of /> */ |
3991 | 2.78k | c = TY_(ReadChar)(doc->docIn); |
3992 | | |
3993 | 2.78k | if ( c == '>' && !TY_(IsUrl)(doc, name) ) |
3994 | 1.07k | { |
3995 | 1.07k | *isempty = yes; |
3996 | 1.07k | TY_(UngetChar)(c, doc->docIn); |
3997 | 1.07k | break; |
3998 | 1.07k | } |
3999 | | |
4000 | | /* unget peeked character */ |
4001 | 1.70k | TY_(UngetChar)(c, doc->docIn); |
4002 | 1.70k | c = '/'; |
4003 | 1.70k | } |
4004 | 230k | } |
4005 | 1.82M | else /* delim is '\'' or '"' */ |
4006 | 1.82M | { |
4007 | 1.82M | if (c == delim) |
4008 | 19.4k | break; |
4009 | | |
4010 | 1.80M | if (c == '\n' || c == '<' || c == '>') |
4011 | 849k | ++quotewarning; |
4012 | | |
4013 | 1.80M | if (c == '>') |
4014 | 23.7k | seen_gt = yes; |
4015 | 1.80M | } |
4016 | | |
4017 | 2.03M | if (c == '&') |
4018 | 10.0k | { |
4019 | 10.0k | TY_(AddCharToLexer)(lexer, c); |
4020 | 10.0k | ParseEntity( doc, IgnoreWhitespace ); |
4021 | 10.0k | if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge) |
4022 | 977 | ChangeChar(lexer, ' '); |
4023 | 10.0k | continue; |
4024 | 10.0k | } |
4025 | | |
4026 | | /* |
4027 | | kludge for JavaScript attribute values |
4028 | | with line continuations in string literals |
4029 | | */ |
4030 | 2.02M | if (c == '\\') |
4031 | 1.19k | { |
4032 | 1.19k | c = TY_(ReadChar)(doc->docIn); |
4033 | | |
4034 | 1.19k | if (c != '\n') |
4035 | 661 | { |
4036 | 661 | TY_(UngetChar)(c, doc->docIn); |
4037 | 661 | c = '\\'; |
4038 | 661 | } |
4039 | 1.19k | } |
4040 | | |
4041 | 2.02M | if (TY_(IsWhite)(c)) |
4042 | 1.59M | { |
4043 | 1.59M | if ( delim == 0 ) |
4044 | 10.1k | break; |
4045 | | |
4046 | 1.58M | if (munge) |
4047 | 1.58M | { |
4048 | | /* discard line breaks in quoted URLs */ |
4049 | | /* #438650 - fix by Randy Waki */ |
4050 | 1.58M | if ( c == '\n' && TY_(IsUrl)(doc, name) ) |
4051 | 765 | { |
4052 | | /* warn that we discard this newline */ |
4053 | 765 | TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI); |
4054 | 765 | continue; |
4055 | 765 | } |
4056 | | |
4057 | 1.58M | c = ' '; |
4058 | | |
4059 | 1.58M | if (lastc == ' ') |
4060 | 1.57M | { |
4061 | 1.57M | if (TY_(IsUrl)(doc, name) ) |
4062 | 1.28k | TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI); |
4063 | 1.57M | continue; |
4064 | 1.57M | } |
4065 | 1.58M | } |
4066 | 1.58M | } |
4067 | 433k | else if (foldCase && TY_(IsUpper)(c)) |
4068 | 455 | c = TY_(ToLower)(c); |
4069 | | |
4070 | 439k | TY_(AddCharToLexer)(lexer, c); |
4071 | 439k | } |
4072 | | |
4073 | 59.0k | if (quotewarning > 10 && seen_gt && munge) |
4074 | 3.32k | { |
4075 | | /* |
4076 | | there is almost certainly a missing trailing quote mark |
4077 | | as we have see too many newlines, < or > characters. |
4078 | | |
4079 | | an exception is made for Javascript attributes and the |
4080 | | javascript URL scheme which may legitimately include < and >, |
4081 | | and for attributes starting with "<xml " as generated by |
4082 | | Microsoft Office. |
4083 | | */ |
4084 | 3.32k | if ( !TY_(IsScript)(doc, name) && |
4085 | 3.32k | !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) && |
4086 | 3.32k | !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0) |
4087 | 3.32k | ) |
4088 | 884 | TY_(Report)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE ); |
4089 | 3.32k | } |
4090 | | |
4091 | 59.0k | len = lexer->lexsize - start; |
4092 | 59.0k | lexer->lexsize = start; |
4093 | | |
4094 | | |
4095 | 59.0k | if (len > 0 || delim) |
4096 | 57.2k | { |
4097 | | /* ignore leading and trailing white space for all but title, alt, value */ |
4098 | | /* and prompts attributes unless --literal-attributes is set to yes */ |
4099 | | /* #994841 - Whitespace is removed from value attributes */ |
4100 | | |
4101 | | /* Issue #217 - Also only if/while (len > 0) - MUST NEVER GO NEGATIVE! */ |
4102 | 57.2k | if ((len > 0) && munge && |
4103 | 57.2k | TY_(tmbstrcasecmp)(name, "alt") && |
4104 | 57.2k | TY_(tmbstrcasecmp)(name, "title") && |
4105 | 57.2k | TY_(tmbstrcasecmp)(name, "value") && |
4106 | 57.2k | TY_(tmbstrcasecmp)(name, "prompt")) |
4107 | 55.6k | { |
4108 | 56.6k | while (TY_(IsWhite)(lexer->lexbuf[start+len-1]) && (len > 0)) |
4109 | 958 | --len; |
4110 | | |
4111 | | /* Issue #497 - Fix leading space trimming */ |
4112 | 56.1k | while (TY_(IsWhite)(lexer->lexbuf[start]) && (len > 0)) |
4113 | 460 | { |
4114 | 460 | ++start; |
4115 | 460 | --len; |
4116 | 460 | } |
4117 | 55.6k | } |
4118 | | |
4119 | 57.2k | value = TY_(tmbstrndup)(doc->allocator, lexer->lexbuf + start, len); |
4120 | 57.2k | } |
4121 | 1.83k | else |
4122 | 1.83k | value = NULL; |
4123 | | |
4124 | | /* note delimiter if given */ |
4125 | 59.0k | *pdelim = delim; |
4126 | | |
4127 | 59.0k | return value; |
4128 | 61.7k | } |
4129 | | |
4130 | | /* attr must be non-NULL */ |
4131 | | static Bool IsValidAttrName( ctmbstr attr ) |
4132 | 288k | { |
4133 | 288k | uint i, c = attr[0]; |
4134 | | |
4135 | | /* first character should be a letter */ |
4136 | 288k | if (!TY_(IsLetter)(c)) |
4137 | 134k | return no; |
4138 | | |
4139 | | /* remaining characters should be namechars */ |
4140 | 514k | for( i = 1; i < TY_(tmbstrlen)(attr); i++) |
4141 | 380k | { |
4142 | 380k | c = attr[i]; |
4143 | | |
4144 | 380k | if (TY_(IsNamechar)(c)) |
4145 | 360k | continue; |
4146 | | |
4147 | 19.2k | return no; |
4148 | 380k | } |
4149 | | |
4150 | 134k | return yes; |
4151 | 153k | } |
4152 | | |
4153 | | /* create a new attribute */ |
4154 | | AttVal *TY_(NewAttribute)( TidyDocImpl* doc ) |
4155 | 503k | { |
4156 | 503k | AttVal *av = (AttVal*) TidyDocAlloc( doc, sizeof(AttVal) ); |
4157 | 503k | TidyClearMemory( av, sizeof(AttVal) ); |
4158 | 503k | return av; |
4159 | 503k | } |
4160 | | |
4161 | | /* create a new attribute with given name and value */ |
4162 | | AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, |
4163 | | int delim ) |
4164 | 25.1k | { |
4165 | 25.1k | AttVal *av = TY_(NewAttribute)(doc); |
4166 | 25.1k | av->attribute = TY_(tmbstrdup)(doc->allocator, name); |
4167 | 25.1k | av->value = TY_(tmbstrdup)(doc->allocator, value); |
4168 | 25.1k | av->delim = delim; |
4169 | 25.1k | av->dict = TY_(FindAttribute)( doc, av ); |
4170 | 25.1k | return av; |
4171 | 25.1k | } |
4172 | | |
4173 | | static void AddAttrToList( AttVal** list, AttVal* av ) |
4174 | 148k | { |
4175 | 148k | if ( *list == NULL ) |
4176 | 117k | *list = av; |
4177 | 30.8k | else |
4178 | 30.8k | { |
4179 | 30.8k | AttVal* here = *list; |
4180 | 467k | while ( here->next ) |
4181 | 436k | here = here->next; |
4182 | 30.8k | here->next = av; |
4183 | 30.8k | } |
4184 | 148k | } |
4185 | | |
4186 | | void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ) |
4187 | 6.07k | { |
4188 | 6.07k | AddAttrToList(&node->attributes, av); |
4189 | 6.07k | } |
4190 | | |
4191 | | void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ) |
4192 | 25.1k | { |
4193 | 25.1k | av->next = node->attributes; |
4194 | 25.1k | node->attributes = av; |
4195 | 25.1k | } |
4196 | | |
/* swallows closing '>' */
/*
 Parses the attributes of the current tag and returns them as a linked
 list in source order (NULL when none were kept).

 Each pass calls ParseAttribute() for a name and ParseValue() for its
 value. ASP and PHP nodes found between attributes are stored as
 attributes that carry only the asp or php pointer. Names that fail
 validation are reported and discarded together with their value.
 *isempty is passed through to both helpers.
*/

static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
{
    Lexer* lexer = doc->lexer;
    AttVal *av, *list;
    tmbstr value;
    int delim;
    Node *asp, *php;

    list = NULL;

    while ( !EndOfInput(doc) )
    {
        tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );

        if (attribute == NULL)
        {
            /* check if attributes are created by ASP markup */
            if (asp)
            {
                av = TY_(NewAttribute)(doc);
                av->asp = asp;
                AddAttrToList( &list, av );
                continue;
            }

            /* check if attributes are created by PHP markup */
            if (php)
            {
                av = TY_(NewAttribute)(doc);
                av->php = php;
                AddAttrToList( &list, av );
                continue;
            }

            /* no name and no server-side node: stop parsing attributes */
            break;
        }

        value = ParseValue( doc, attribute, no, isempty, &delim );

        /* attribute is non-NULL here, the NULL case left the loop above */
        if (attribute && (IsValidAttrName(attribute) ||
            (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
        {
            av = TY_(NewAttribute)(doc);
            /* unquoted values are recorded with '"' as their delimiter */
            av->delim = delim ? delim : '"';
            av->attribute = attribute;
            av->value = value;
            av->dict = TY_(FindAttribute)( doc, av );
            AddAttrToList( &list, av );
            if ( !delim && value )
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK_OPEN);
        }
        else
        {
            /* invalid name: a temporary attribute is built only for the report */
            av = TY_(NewAttribute)(doc);
            av->attribute = attribute;
            av->value = value;

            if (LastChar(attribute) == '"')
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
            else if (value == NULL)
                TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
            else
                TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);

            /* presumably releases the name and value strings as well - TODO confirm in FreeAttribute */
            TY_(FreeAttribute)( doc, av );
        }
    }

    return list;
}
4269 | | |
/*
  Returns document type declarations like

  <!DOCTYPE foo PUBLIC "fpi" "sysid">
  <!DOCTYPE bar SYSTEM "sysid">
  <!DOCTYPE baz [ <!ENTITY ouml "&#246;"> ]>

  as

  <foo PUBLIC="fpi" SYSTEM="sysid" />
  <bar SYSTEM="sysid" />
  <baz> <!ENTITY ouml "&#246"> </baz>

  The declaration is read with a small state machine. The new node is
  returned when '>' is reached and the collected name passes
  IsValidXMLElemName(). Otherwise, and when the stream ends first,
  MALFORMED_DOCTYPE is reported, the node is freed and NULL is returned.
*/
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
{
    Lexer *lexer = doc->lexer;
    /* offset in the lexer buffer where the piece being collected begins */
    int start = lexer->lexsize;
    ParseDocTypeDeclState state = DT_DOCTYPENAME;
    uint c;
    uint delim = 0;
    /* yes while the next quoted string is to be stored as PUBLIC, no for SYSTEM */
    Bool hasfpi = yes;

    Node* node = TY_(NewNode)(lexer->allocator, lexer);
    node->type  = DocTypeTag;
    node->start = lexer->txtstart;
    node->end   = lexer->txtend;

    lexer->waswhite = no;

    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */

    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
    {
        /* convert newlines to spaces */
        if (state != DT_INTSUBSET)
            c = c == '\n' ? ' ' : c;

        /* convert white-space sequences to single space character */
        if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
        {
            if (!lexer->waswhite)
            {
                TY_(AddCharToLexer)(lexer, c);
                lexer->waswhite = yes;
            }
            else
            {
                /* discard space */
                continue;
            }
        }
        else
        {
            TY_(AddCharToLexer)(lexer, c);
            lexer->waswhite = no;
        }

        /* c is already in the buffer, so lexsize - 1 is the index of c */
        switch(state)
        {
        case DT_INTERMEDIATE:
            /* determine what's next */
            if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
            {
                /* the keyword starts at the character just buffered */
                start = lexer->lexsize - 1;
                state = DT_PUBLICSYSTEM;
                continue;
            }
            else if (c == '[')
            {
                start = lexer->lexsize;
                state = DT_INTSUBSET;
                continue;
            }
            else if (c == '\'' || c == '"')
            {
                start = lexer->lexsize;
                delim = c;
                state = DT_QUOTEDSTRING;
                continue;
            }
            else if (c == '>')
            {
                AttVal* si;

                /* drop the '>' from the buffer and close the node's text range */
                node->end = --(lexer->lexsize);

                si = TY_(GetAttrByName)(node, "SYSTEM");
                if (si)
                    TY_(CheckUrl)(doc, node, si);

                if (!node->element || !IsValidXMLElemName(node->element))
                {
                    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
                    TY_(FreeNode)(doc, node);
                    return NULL;
                }
                return node;
            }
            else
            {
                /* error */
            }
            break;
        case DT_DOCTYPENAME:
            /* read document type name */
            if (TY_(IsWhite)(c) || c == '>' || c == '[')
            {
                /* the name excludes the terminating character just buffered */
                node->element = TY_(tmbstrndup)(doc->allocator,
                                                lexer->lexbuf + start,
                                                lexer->lexsize - start - 1);
                if (c == '>' || c == '[')
                {
                    /* hand the terminator back so DT_INTERMEDIATE processes it */
                    --(lexer->lexsize);
                    TY_(UngetChar)(c, doc->docIn);
                }

                state = DT_INTERMEDIATE;
                continue;
            }
            break;
        case DT_PUBLICSYSTEM:
            /* read PUBLIC/SYSTEM */
            if (TY_(IsWhite)(c) || c == '>')
            {
                char *attname = TY_(tmbstrndup)(doc->allocator,
                                                lexer->lexbuf + start,
                                                lexer->lexsize - start - 1);
                /* any keyword other than SYSTEM is treated like PUBLIC */
                hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);

                TidyDocFree(doc, attname);

                /* todo: report an error if SYSTEM/PUBLIC not uppercase */

                if (c == '>')
                {
                    --(lexer->lexsize);
                    TY_(UngetChar)(c, doc->docIn);
                }

                state = DT_INTERMEDIATE;
                continue;
            }
            break;
        case DT_QUOTEDSTRING:
            /* read quoted string */
            if (c == delim)
            {
                char *value = TY_(tmbstrndup)(doc->allocator,
                                              lexer->lexbuf + start,
                                              lexer->lexsize - start - 1);
                /* NOTE(review): att is dereferenced without a NULL check - confirm AddAttribute cannot return NULL */
                AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
                TidyDocFree(doc, value);
                att->delim = delim;
                /* a further quoted string is stored as SYSTEM */
                hasfpi = no;
                state = DT_INTERMEDIATE;
                delim = 0;
                continue;
            }
            break;
        case DT_INTSUBSET:
            /* read internal subset */
            if (c == ']')
            {
                /* the text between '[' and ']' becomes a text node child */
                Node* subset;
                lexer->txtstart = start;
                lexer->txtend = lexer->lexsize - 1;
                subset = TY_(TextToken)(lexer);
                TY_(InsertNodeAtEnd)(node, subset);
                state = DT_INTERMEDIATE;
            }
            break;
        }
    }

    /* document type declaration not finished */
    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
    TY_(FreeNode)(doc, node);
    return NULL;
}
4449 | | |
4450 | | |
4451 | | /****************************************************************************//* |
4452 | | ** MARK: - Node Stack |
4453 | | ***************************************************************************/ |
4454 | | |
4455 | | |
4456 | | /** |
4457 | | * Create a new stack with a given starting capacity. If memory allocation |
4458 | | * fails, then the allocator will panic the program automatically. |
4459 | | */ |
4460 | | Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity) |
4461 | 17.0k | { |
4462 | 17.0k | Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack)); |
4463 | 17.0k | stack->top = -1; |
4464 | 17.0k | stack->capacity = capacity; |
4465 | 17.0k | stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node**)); |
4466 | 17.0k | stack->allocator = doc->allocator; |
4467 | 17.0k | return stack; |
4468 | 17.0k | } |
4469 | | |
4470 | | |
4471 | | /** |
4472 | | * Increase the stack size. This will be called automatically when the |
4473 | | * current stack is full. If memory allocation fails, then the allocator |
4474 | | * will panic the program automatically. |
4475 | | */ |
4476 | | void TY_(growStack)(Stack *stack) |
4477 | 44 | { |
4478 | 44 | uint new_capacity = stack->capacity * 2; |
4479 | | |
4480 | 44 | Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node**)); |
4481 | | |
4482 | 44 | memcpy( firstNode, stack->firstNode, sizeof(Node**) * (stack->top + 1) ); |
4483 | 44 | TidyFree(stack->allocator, stack->firstNode); |
4484 | | |
4485 | 44 | stack->firstNode = firstNode; |
4486 | 44 | stack->capacity = new_capacity; |
4487 | 44 | } |
4488 | | |
4489 | | |
4490 | | /** |
4491 | | * Stack is full when top is equal to the last index. |
4492 | | */ |
4493 | | Bool TY_(stackFull)(Stack *stack) |
4494 | 1.68M | { |
4495 | 1.68M | return stack->top == stack->capacity - 1; |
4496 | 1.68M | } |
4497 | | |
4498 | | |
4499 | | /** |
4500 | | * Stack is empty when top is equal to -1 |
4501 | | */ |
4502 | | Bool TY_(stackEmpty)(Stack *stack) |
4503 | 140k | { |
4504 | 140k | return stack->top == -1; |
4505 | 140k | } |
4506 | | |
4507 | | |
4508 | | /** |
4509 | | * Push an item to the stack. |
4510 | | */ |
4511 | | void TY_(push)(Stack *stack, Node *node) |
4512 | 1.68M | { |
4513 | 1.68M | if (TY_(stackFull)(stack)) |
4514 | 44 | TY_(growStack)(stack); |
4515 | | |
4516 | 1.68M | if (node) |
4517 | 123k | stack->firstNode[++stack->top] = node; |
4518 | 1.68M | } |
4519 | | |
4520 | | |
4521 | | /** |
4522 | | * Pop an item from the stack. |
4523 | | */ |
4524 | | Node* TY_(pop)(Stack *stack) |
4525 | 140k | { |
4526 | 140k | return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--]; |
4527 | 140k | } |
4528 | | |
4529 | | |
4530 | | /** |
4531 | | * Peek at the stack. |
4532 | | */ |
4533 | | FUNC_UNUSED Node* TY_(peek)(Stack *stack) |
4534 | 0 | { |
4535 | 0 | return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--]; |
4536 | 0 | } |
4537 | | |
4538 | | /** |
4539 | | * Frees the stack when done. |
4540 | | */ |
4541 | | void TY_(freeStack)(Stack *stack) |
4542 | 17.0k | { |
4543 | 17.0k | TidyFree( stack->allocator, stack->firstNode ); |
4544 | 17.0k | stack->top = -1; |
4545 | 17.0k | stack->capacity = 0; |
4546 | 17.0k | stack->firstNode = NULL; |
4547 | 17.0k | stack->allocator = NULL; |
4548 | 17.0k | } |