/src/clamav/libclamav/htmlnorm.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (C) 2013-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved. |
3 | | * Copyright (C) 2007-2013 Sourcefire, Inc. |
4 | | * |
5 | | * Authors: Trog |
6 | | * |
7 | | * Summary: Normalise HTML text. Decode MS Script Encoder protection. |
8 | | * The ScrEnc decoder was initially based upon an analysis by Andreas Marx. |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License version 2 as |
12 | | * published by the Free Software Foundation. |
13 | | * |
14 | | * This program is distributed in the hope that it will be useful, |
15 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | | * GNU General Public License for more details. |
18 | | * |
19 | | * You should have received a copy of the GNU General Public License |
20 | | * along with this program; if not, write to the Free Software |
21 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, |
22 | | * MA 02110-1301, USA. |
23 | | */ |
24 | | |
25 | | #if HAVE_CONFIG_H |
26 | | #include "clamav-config.h" |
27 | | #endif |
28 | | |
29 | | #include <stdio.h> |
30 | | #ifdef HAVE_UNISTD_H |
31 | | #include <unistd.h> |
32 | | #endif |
33 | | #include <sys/types.h> |
34 | | #include <sys/stat.h> |
35 | | #include <fcntl.h> |
36 | | #ifdef HAVE_STRINGS_H |
37 | | #include <strings.h> |
38 | | #endif |
39 | | #include <string.h> |
40 | | #include <errno.h> |
41 | | #include <stdio.h> |
42 | | #include <ctype.h> |
43 | | |
44 | | #include "clamav.h" |
45 | | #include "fmap.h" |
46 | | #include "others.h" |
47 | | #include "htmlnorm.h" |
48 | | |
49 | | #include "entconv.h" |
50 | | #include "jsparse/js-norm.h" |
51 | | |
52 | | #include "clamav_rust.h" |
53 | | #include "scanners.h" |
54 | | |
/* Capacity (excluding the terminating NUL) of the tag, attribute-name,
 * attribute-value and entity scratch buffers declared in cli_html_normalise. */
#define HTML_STR_LENGTH 1024
/* Capacity (excluding the terminating NUL) of tag_contents.contents. */
#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
57 | | |
/* States of the normaliser's state machine; cli_html_normalise dispatches on
 * the current state once per input byte. Only states whose handling is
 * visible near the top of that switch are annotated here. */
typedef enum {
    HTML_BAD_STATE,    /* engine error: normalisation is abandoned */
    HTML_NORM,         /* ordinary text; '<' is detected here */
    HTML_8BIT,         /* accumulating bytes >= 0x80 into a multi-byte value */
    HTML_COMMENT,
    HTML_CHAR_REF,
    HTML_ENTITY_REF_DECODE,
    HTML_SKIP_WS,      /* consume whitespace, then switch to next_state */
    HTML_TRIM_WS,      /* like HTML_SKIP_WS, but may emit a single ' ' first */
    HTML_TAG,
    HTML_TAG_ARG,
    HTML_TAG_ARG_VAL,
    HTML_TAG_ARG_EQUAL,
    HTML_PROCESS_TAG,
    HTML_CHAR_REF_DECODE,
    HTML_LOOKFOR_SCRENC,
    HTML_JSDECODE,
    HTML_JSDECODE_LENGTH,
    HTML_JSDECODE_DECRYPT,
    HTML_SPECIAL_CHAR, /* not expected as a live state; only logged if reached */
    HTML_RFC2397_TYPE,
    HTML_RFC2397_INIT,
    HTML_RFC2397_DATA,
    HTML_RFC2397_FINISH,
    HTML_RFC2397_ESC,
    HTML_ESCAPE_CHAR
} html_state;
85 | | |
/* Kind of element whose body is currently being collected.
 * TAG_DONT_EXTRACT means "not inside such an element"; js_process treats it
 * as the signal that a script has ended and can be normalised. */
typedef enum {
    TAG_DONT_EXTRACT,
    TAG_SCRIPT,
    TAG_STYLE,
} tag_type;
91 | | |
/* Quoting style of the attribute value currently being parsed
 * (presumably set while in HTML_TAG_ARG_VAL -- confirm against the parser). */
typedef enum {
    SINGLE_QUOTED,
    DOUBLE_QUOTED,
    NOT_QUOTED
} quoted_state;
97 | | |
/* Size in bytes of the staging buffer inside file_buff_t. */
#define HTML_FILE_BUFF_LEN 8192

/* An output file together with a staging buffer; bytes are collected in
 * `buffer` and written to `fd` by html_output_flush(). */
typedef struct file_buff_tag {
    int fd;                                   /* destination file descriptor */
    unsigned char buffer[HTML_FILE_BUFF_LEN]; /* bytes not yet written to fd */
    uint64_t length;                          /* number of pending bytes in buffer */
} file_buff_t;
105 | | |
/* Accumulator for the text found between an opening and closing tag. */
struct tag_contents {
    size_t pos;                                          /* next write index into contents */
    unsigned char contents[MAX_TAG_CONTENTS_LENGTH + 1]; /* collected bytes plus room for a terminator */
};
110 | | |
/* Lookup from an input byte to its base64 digit value (A-Z = 0..25,
 * a-z = 26..51, 0-9 = 52..61, '+' = 62, '/' = 63); every other byte maps
 * to -1. screnc_decode() uses it to read the checksum that follows the
 * encoded data. */
// clang-format off
static const int64_t base64_chars[256] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
// clang-format on
131 | | |
/* For each of the 64 positions in the repeating decode cycle (indexed by
 * screnc_state.table_pos), the index of the decrypt_tables row to use.
 * The entries are written with a leading zero, i.e. as octal literals; since
 * they are all 0, 1 or 2 the values are the same as in decimal. */
int table_order[] = {
    00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
    00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
    00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
    00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02};
137 | | |
/* Three substitution tables used by screnc_decode(): an encoded byte below
 * 0x80 is looked up in the table selected by table_order to obtain the
 * decoded byte. Index 0x40 ('@') holds 0xFF in every table; screnc_decode()
 * treats that value as the prefix of a two-byte escape sequence.
 *
 * NOTE(review): index 0x0D of the third table holds 0x06, whereas the other
 * two tables map 0x0D to itself. This looks like a typo; it appears to be
 * harmless because screnc_decode() skips '\r' before any table lookup, but
 * confirm before relying on it. */
int decrypt_tables[3][128] = {
    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
     0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
     0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
     0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
     0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
     0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},

    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
     0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
     0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
     0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
     0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
     0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},

    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
     0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
     0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
     0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
     0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
     0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}};
165 | | |
/**
 * Find where to cut a chunk so that it ends on whitespace.
 *
 * Scans backwards from the end of @chunk for the last whitespace byte located
 * at index 1 or later.
 *
 * @param chunk  buffer to scan; @len bytes must be readable
 * @param len    number of bytes in @chunk (0 is accepted)
 * @return the length of the prefix ending with that whitespace byte, or @len
 *         unchanged when no whitespace is found past index 0.
 */
static inline unsigned int rewind_tospace(const unsigned char *chunk, unsigned int len)
{
    unsigned int count = len;

    /* Test the bound before indexing so that len == 0 never evaluates
     * chunk[len - 1], whose index would wrap around. */
    while ((len > 1) && !isspace(chunk[len - 1])) {
        len--;
    }
    if (len == 1) {
        /* nothing usable to rewind to: keep the whole chunk */
        return count;
    }
    return len;
}
177 | | |
/**
 * Read the next chunk of input, dropping NUL bytes.
 *
 * Reads from @m_area when it is non-NULL, otherwise from @stream. At most
 * @max_len - 1 bytes are stored, followed by a NUL terminator. When the chunk
 * was cut before the end of the input and not at whitespace, it is shortened
 * to the last whitespace (see rewind_tospace) and the input position is moved
 * back by the number of bytes dropped.
 *
 * This used to be called cli_readline, but we don't stop at end-of-line anymore.
 *
 * @param stream   input stream, used only when @m_area is NULL
 * @param m_area   memory/fmap backed input, or NULL
 * @param max_len  size of the chunk buffer to allocate
 * @return a buffer obtained from cli_max_malloc() holding the NUL-terminated
 *         chunk (presumably released by the caller with free() -- confirm),
 *         or NULL when no data is left or on error.
 */
static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
{
    unsigned char *chunk, *start, *ptr, *end;
    unsigned int chunk_len, count;

    chunk = (unsigned char *)cli_max_malloc(max_len);
    if (!chunk) {
        cli_errmsg("readchunk: Unable to allocate memory for chunk\n");
        return NULL;
    }

    /* Try to use the memory buffer first */
    if (m_area) {
        /* maximum we can copy into the buffer,
         * we could have less than max_len bytes available */
        chunk_len = MIN(m_area->length - m_area->offset, max_len - 1);
        if (!chunk_len) {
            /* nothing left to read */
            free(chunk);
            return NULL;
        }
        if (m_area->map)
            ptr = (unsigned char *)fmap_need_off_once(m_area->map, m_area->offset, chunk_len);
        else
            ptr = m_area->buffer + m_area->offset;
        start = ptr;
        /* end of the whole area, expressed relative to the current read pointer */
        end = ptr - m_area->offset + m_area->length;

        if ((start >= end) || !start) {
            free(chunk);
            return NULL;
        }

        /* look for NULL chars */
        ptr = memchr(start, 0, chunk_len);
        if (!ptr) {
            /* no NULL chars found, copy all */
            memcpy(chunk, start, chunk_len);
            chunk[chunk_len] = '\0';
            m_area->offset += chunk_len;
            /* point ptr to end of chunk,
             * so we can check and rewind to a space below */
            ptr = start + chunk_len;
        } else {
            /* copy portion that doesn't contain NULL chars */
            chunk_len = ptr - start;
            if (chunk_len < max_len) {
                memcpy(chunk, start, chunk_len);
            } else {
                chunk_len = 0;
                ptr       = start;
            }
            /* the rest of the area is read byte by byte below, so request it from the map */
            if (m_area->map)
                ptr = (unsigned char *)fmap_need_ptr_once(m_area->map, ptr, end - ptr);
            if (!ptr) {
                cli_warnmsg("fmap inconsistency\n");
                ptr = end;
            }
            /* we have unknown number of NULL chars,
             * copy char-by-char and skip them */
            while ((ptr < end) && (chunk_len < max_len - 1)) {
                const unsigned char c = *ptr++;
                /* we can't use chunk_len to determine how many bytes we read, since
                 * we skipped chars */
                if (c) {
                    chunk[chunk_len++] = c;
                }
            }
            /* advance by bytes consumed (including skipped NULs), not bytes stored */
            m_area->offset += ptr - start;
            chunk[chunk_len] = '\0';
        }
        if (ptr && ptr < end && !isspace(*ptr)) {
            /* we hit max_len, rewind to a space */
            count = rewind_tospace(chunk, chunk_len);
            if (count < chunk_len) {
                chunk[count] = '\0';
                /* NOTE(review): the rewind distance is counted in stored bytes; if NULs
                 * were skipped inside the dropped tail the offset may not land exactly
                 * on the whitespace -- confirm whether that matters. */
                m_area->offset -= chunk_len - count;
            }
        }
    } else {
        if (!stream) {
            cli_dbgmsg("No HTML stream\n");
            free(chunk);
            return NULL;
        }
        chunk_len = fread(chunk, 1, max_len - 1, stream);
        if (!chunk_len || chunk_len > max_len - 1) {
            /* EOF, or prevent overflow */
            free(chunk);
            return NULL;
        }

        /* Look for NULL chars */
        ptr = memchr(chunk, 0, chunk_len);
        if (ptr) {
            /* NULL char found */
            /* save buffer limits */
            start = ptr;
            end   = chunk + chunk_len;

            /* start of NULL chars, we will copy non-NULL characters
             * to this position */
            chunk_len = ptr - chunk;

            /* find first non-NULL char */
            while ((ptr < end) && !(*ptr)) {
                ptr++;
            }
            /* skip over NULL chars, and move back the rest */
            while ((ptr < end) && (chunk_len < max_len - 1)) {
                const unsigned char c = *ptr++;
                if (c) {
                    chunk[chunk_len++] = c;
                }
            }
        }
        chunk[chunk_len] = '\0';
        /* a completely full buffer means the read was probably cut mid-token */
        if (chunk_len == max_len - 1) {
            /* rewind to a space (which includes newline) */
            count = rewind_tospace(chunk, chunk_len);
            if (count < chunk_len) {
                chunk[count] = '\0';
                /* seek-back to space */
                fseek(stream, -(long)(chunk_len - count), SEEK_CUR);
            }
        }
    }

    return chunk;
}
309 | | |
310 | | static void html_output_flush(file_buff_t *fbuff) |
311 | 930k | { |
312 | 930k | if (fbuff && (fbuff->length > 0)) { |
313 | 930k | cli_writen(fbuff->fd, fbuff->buffer, fbuff->length); |
314 | 930k | fbuff->length = 0; |
315 | 930k | } |
316 | 930k | } |
317 | | |
318 | | static inline void html_output_c(file_buff_t *fbuff1, unsigned char c) |
319 | 3.30G | { |
320 | 3.30G | if (fbuff1) { |
321 | 3.30G | if (fbuff1->length == HTML_FILE_BUFF_LEN) { |
322 | 301k | html_output_flush(fbuff1); |
323 | 301k | } |
324 | 3.30G | fbuff1->buffer[fbuff1->length++] = c; |
325 | 3.30G | } |
326 | 3.30G | } |
327 | | |
328 | | static void html_output_str(file_buff_t *fbuff, const unsigned char *str, size_t len) |
329 | 1.24M | { |
330 | 1.24M | if (fbuff) { |
331 | 1.24M | if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) { |
332 | 2.49k | html_output_flush(fbuff); |
333 | 2.49k | } |
334 | 1.24M | if (len >= HTML_FILE_BUFF_LEN) { |
335 | 0 | html_output_flush(fbuff); |
336 | 0 | cli_writen(fbuff->fd, str, len); |
337 | 1.24M | } else { |
338 | 1.24M | memcpy(fbuff->buffer + fbuff->length, str, len); |
339 | 1.24M | fbuff->length += len; |
340 | 1.24M | } |
341 | 1.24M | } |
342 | 1.24M | } |
343 | | |
344 | | static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag) |
345 | 1.87M | { |
346 | 1.87M | int i; |
347 | | |
348 | 3.80M | for (i = 0; i < tags->count; i++) { |
349 | 3.01M | if (strcmp((const char *)tags->tag[i], tag) == 0) { |
350 | 1.08M | return (char *)tags->value[i]; |
351 | 1.08M | } |
352 | 3.01M | } |
353 | 793k | return NULL; |
354 | 1.87M | } |
355 | | |
356 | | static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value) |
357 | 14.7k | { |
358 | 14.7k | int i; |
359 | | |
360 | 104k | for (i = 0; i < tags->count; i++) { |
361 | 104k | if (strcmp((const char *)tags->tag[i], tag) == 0) { |
362 | 14.7k | free(tags->value[i]); |
363 | 14.7k | tags->value[i] = (unsigned char *)cli_safer_strdup(value); |
364 | 14.7k | return; |
365 | 14.7k | } |
366 | 104k | } |
367 | 0 | return; |
368 | 14.7k | } |
369 | | void html_tag_arg_add(tag_arguments_t *tags, |
370 | | const char *tag, char *value) |
371 | 7.83M | { |
372 | 7.83M | int len, i; |
373 | 7.83M | tags->count++; |
374 | 7.83M | tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag, |
375 | 7.83M | tags->count * sizeof(char *)); |
376 | 7.83M | if (!tags->tag) { |
377 | 0 | goto done; |
378 | 0 | } |
379 | 7.83M | tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value, |
380 | 7.83M | tags->count * sizeof(char *)); |
381 | 7.83M | if (!tags->value) { |
382 | 0 | goto done; |
383 | 0 | } |
384 | 7.83M | if (tags->scanContents) { |
385 | 0 | tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents, |
386 | 0 | tags->count * sizeof(*tags->contents)); |
387 | 0 | if (!tags->contents) { |
388 | 0 | goto done; |
389 | 0 | } |
390 | 0 | tags->contents[tags->count - 1] = NULL; |
391 | 0 | } |
392 | 7.83M | tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag); |
393 | 7.83M | if (value) { |
394 | 2.39M | if (*value == '"') { |
395 | 672k | tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1); |
396 | 672k | len = strlen((const char *)value + 1); |
397 | 672k | if (len > 0) { |
398 | 668k | tags->value[tags->count - 1][len - 1] = '\0'; |
399 | 668k | } |
400 | 1.72M | } else { |
401 | 1.72M | tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value); |
402 | 1.72M | } |
403 | 5.44M | } else { |
404 | 5.44M | tags->value[tags->count - 1] = NULL; |
405 | 5.44M | } |
406 | 7.83M | return; |
407 | | |
408 | 0 | done: |
409 | | /* Bad error - can't do 100% recovery */ |
410 | 0 | tags->count--; |
411 | 0 | for (i = 0; i < tags->count; i++) { |
412 | 0 | if (tags->tag) { |
413 | 0 | free(tags->tag[i]); |
414 | 0 | } |
415 | 0 | if (tags->value) { |
416 | 0 | free(tags->value[i]); |
417 | 0 | } |
418 | 0 | if (tags->contents) { |
419 | 0 | if (tags->contents[i]) |
420 | 0 | free(tags->contents[i]); |
421 | 0 | } |
422 | 0 | } |
423 | 0 | if (tags->tag) { |
424 | 0 | free(tags->tag); |
425 | 0 | } |
426 | 0 | if (tags->value) { |
427 | 0 | free(tags->value); |
428 | 0 | } |
429 | 0 | if (tags->contents) |
430 | 0 | free(tags->contents); |
431 | 0 | tags->contents = NULL; |
432 | 0 | tags->tag = tags->value = NULL; |
433 | 0 | tags->count = 0; |
434 | 0 | return; |
435 | 7.83M | } |
436 | | |
437 | | static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags) |
438 | 14.7k | { |
439 | 14.7k | int i, j, len; |
440 | | |
441 | 14.7k | html_output_c(fbuff, '<'); |
442 | 14.7k | html_output_str(fbuff, (const unsigned char *)tag, strlen(tag)); |
443 | 124k | for (i = 0; i < tags->count; i++) { |
444 | 110k | html_output_c(fbuff, ' '); |
445 | 110k | html_output_str(fbuff, tags->tag[i], strlen((const char *)tags->tag[i])); |
446 | 110k | if (tags->value[i]) { |
447 | 41.8k | html_output_str(fbuff, (const unsigned char *)"=\"", 2); |
448 | 41.8k | len = strlen((const char *)tags->value[i]); |
449 | 773k | for (j = 0; j < len; j++) { |
450 | 731k | html_output_c(fbuff, tolower(tags->value[i][j])); |
451 | 731k | } |
452 | 41.8k | html_output_c(fbuff, '"'); |
453 | 41.8k | } |
454 | 110k | } |
455 | 14.7k | html_output_c(fbuff, '>'); |
456 | 14.7k | } |
457 | | |
458 | | void html_tag_arg_free(tag_arguments_t *tags) |
459 | 9.86M | { |
460 | 9.86M | int i; |
461 | | |
462 | 17.7M | for (i = 0; i < tags->count; i++) { |
463 | 7.83M | free(tags->tag[i]); |
464 | 7.83M | if (tags->value[i]) { |
465 | 2.39M | free(tags->value[i]); |
466 | 2.39M | } |
467 | 7.83M | if (tags->contents) |
468 | 0 | if (tags->contents[i]) |
469 | 0 | free(tags->contents[i]); |
470 | 7.83M | } |
471 | 9.86M | if (tags->tag) { |
472 | 2.52M | free(tags->tag); |
473 | 2.52M | } |
474 | 9.86M | if (tags->value) { |
475 | 2.52M | free(tags->value); |
476 | 2.52M | } |
477 | 9.86M | if (tags->contents) |
478 | 0 | free(tags->contents); |
479 | 9.86M | tags->contents = NULL; |
480 | 9.86M | tags->tag = tags->value = NULL; |
481 | 9.86M | tags->count = 0; |
482 | 9.86M | } |
483 | | |
484 | | /** |
485 | | * the displayed text for an <a href> tag |
486 | | */ |
487 | | static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char *begin, const unsigned char *end) |
488 | 0 | { |
489 | 0 | size_t i; |
490 | 0 | uint32_t mbchar = 0; |
491 | 0 | if (!begin || !end) |
492 | 0 | return; |
493 | 0 | for (i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) { |
494 | 0 | uint8_t c = *begin++; |
495 | 0 | if (mbchar && (c < 0x80 || mbchar >= 0x10000)) { |
496 | 0 | if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 || |
497 | 0 | (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F))) { |
498 | 0 | cont->contents[i++] = '.'; |
499 | 0 | if (mbchar == 0xA1) { |
500 | 0 | --i; |
501 | 0 | mbchar = 0; |
502 | 0 | continue; |
503 | 0 | } |
504 | 0 | } else { |
505 | 0 | uint8_t c0 = mbchar >> 16; |
506 | 0 | uint8_t c1 = (mbchar >> 8) & 0xff; |
507 | 0 | uint8_t c2 = (mbchar & 0xff); |
508 | 0 | if (c0 && i + 1 < MAX_TAG_CONTENTS_LENGTH) |
509 | 0 | cont->contents[i++] = c0; |
510 | 0 | if ((c0 || c1) && i + 1 < MAX_TAG_CONTENTS_LENGTH) |
511 | 0 | cont->contents[i++] = c1; |
512 | 0 | if (i + 1 < MAX_TAG_CONTENTS_LENGTH) |
513 | 0 | cont->contents[i++] = c2; |
514 | 0 | } |
515 | 0 | mbchar = 0; |
516 | 0 | } |
517 | 0 | if (c >= 0x80) { |
518 | 0 | mbchar = (mbchar << 8) | c; |
519 | 0 | --i; |
520 | 0 | } else |
521 | 0 | cont->contents[i] = c; |
522 | 0 | } |
523 | 0 | cont->pos = i; |
524 | 0 | } |
525 | | |
526 | | static inline void html_tag_contents_done(tag_arguments_t *tags, int idx, struct tag_contents *cont) |
527 | 0 | { |
528 | 0 | unsigned char *p; |
529 | 0 | cont->contents[cont->pos++] = '\0'; |
530 | 0 | p = cli_max_malloc(cont->pos); |
531 | 0 | if (!p) { |
532 | 0 | cli_errmsg("html_tag_contents_done: Unable to allocate memory for p\n"); |
533 | 0 | return; |
534 | 0 | } |
535 | 0 | memcpy(p, cont->contents, cont->pos); |
536 | 0 | tags->contents[idx - 1] = p; |
537 | 0 | cont->pos = 0; |
538 | 0 | } |
539 | | |
/* Progress of a Script Encoder decode, as updated by screnc_decode(). */
struct screnc_state {
    uint32_t length;   /* encoded units still to be decoded; 0 means the data part is finished */
    uint32_t sum;      /* running sum of the decoded byte values, compared with the trailing checksum */
    uint8_t table_pos; /* current index into table_order, advanced modulo 64 */
};
545 | | |
546 | | /* inplace decoding, so that we can normalize it later */ |
547 | | static void screnc_decode(unsigned char *ptr, struct screnc_state *s) |
548 | 688k | { |
549 | 688k | uint8_t value; |
550 | 688k | unsigned char *dst = ptr; |
551 | | |
552 | 688k | if (!ptr || !s) |
553 | 0 | return; |
554 | 1.11G | while (s->length > 0 && *ptr) { |
555 | 1.11G | if ((*ptr == '\n') || (*ptr == '\r')) { |
556 | 10.2M | ptr++; |
557 | 10.2M | continue; |
558 | 10.2M | } |
559 | 1.10G | if (*ptr < 0x80) { |
560 | 736M | value = decrypt_tables[table_order[s->table_pos]][*ptr]; |
561 | 736M | if (value == 0xFF) { /* special character */ |
562 | 6.71M | ptr++; |
563 | 6.71M | s->length--; |
564 | 6.71M | switch (*ptr) { |
565 | 8.63k | case '\0': |
566 | | /* Fixup for end of line */ |
567 | 8.63k | ptr--; |
568 | 8.63k | break; |
569 | 61.0k | case 0x21: |
570 | 61.0k | value = 0x3c; |
571 | 61.0k | break; |
572 | 165k | case 0x23: |
573 | 165k | value = 0x0d; |
574 | 165k | break; |
575 | 61.0k | case 0x24: |
576 | 61.0k | value = 0x40; |
577 | 61.0k | break; |
578 | 45.4k | case 0x26: |
579 | 45.4k | value = 0x0a; |
580 | 45.4k | break; |
581 | 40.1k | case 0x2a: |
582 | 40.1k | value = 0x3e; |
583 | 40.1k | break; |
584 | 6.71M | } |
585 | 6.71M | } |
586 | 736M | s->sum += value; |
587 | 736M | *dst++ = value; |
588 | 736M | s->table_pos = (s->table_pos + 1) % 64; |
589 | 736M | } else { |
590 | 365M | *dst++ = *ptr++; |
591 | 365M | *dst++ = *ptr; |
592 | 365M | if (!*ptr) { |
593 | 81.3k | dst--; |
594 | 81.3k | break; |
595 | 81.3k | } |
596 | 365M | } |
597 | 1.10G | ptr++; |
598 | 1.10G | s->length--; |
599 | 1.10G | } |
600 | 688k | if (!s->length) { |
601 | 238k | size_t remaining; |
602 | 238k | if (strlen((const char *)ptr) >= 12) { |
603 | 232k | uint64_t expected; |
604 | 232k | expected = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2; |
605 | 232k | expected += base64_chars[ptr[1]] >> 4; |
606 | 232k | expected += (base64_chars[ptr[1]] & 0x0f) << 12; |
607 | 232k | expected += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8; |
608 | 232k | expected += (base64_chars[ptr[2]] & 0x03) << 22; |
609 | 232k | expected += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16; |
610 | 232k | expected += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24; |
611 | 232k | expected += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24; |
612 | 232k | ptr += 8; |
613 | 232k | if (s->sum != expected) { |
614 | 226k | cli_dbgmsg("screnc_decode: checksum mismatch: %u != %" PRIu64 "\n", s->sum, expected); |
615 | 226k | } else { |
616 | 5.73k | if (strncmp((const char *)ptr, "^#~@", 4) != 0) { |
617 | 4.59k | cli_dbgmsg("screnc_decode: terminator not found\n"); |
618 | 4.59k | } else { |
619 | 1.14k | cli_dbgmsg("screnc_decode: OK\n"); |
620 | 1.14k | } |
621 | 5.73k | } |
622 | 232k | ptr += 4; |
623 | 232k | } |
624 | | /* copy remaining */ |
625 | 238k | remaining = strlen((const char *)ptr) + 1; |
626 | 238k | memmove(dst, ptr, remaining); |
627 | 450k | } else { |
628 | 450k | *dst = '\0'; |
629 | 450k | } |
630 | 688k | } |
631 | | |
632 | | static void js_process(struct parser_state *js_state, const unsigned char *js_begin, const unsigned char *js_end, |
633 | | const unsigned char *line, const unsigned char *ptr, tag_type in_tag, const char *dirname) |
634 | 640k | { |
635 | 640k | if (!js_begin) |
636 | 275k | js_begin = line; |
637 | 640k | if (!js_end) |
638 | 455k | js_end = ptr; |
639 | 640k | if (js_end > js_begin && |
640 | 640k | CLI_ISCONTAINED(line, 8192, js_begin, 1) && |
641 | 640k | CLI_ISCONTAINED(line, 8192, js_end, 1)) { |
642 | 639k | cli_js_process_buffer(js_state, (const char *)js_begin, js_end - js_begin); |
643 | 639k | } |
644 | 640k | if (in_tag == TAG_DONT_EXTRACT) { |
645 | | /* we found a /script, normalize script now */ |
646 | 184k | cli_js_parse_done(js_state); |
647 | 184k | cli_js_output(js_state, dirname); |
648 | 184k | cli_js_destroy(js_state); |
649 | 184k | } |
650 | 640k | } |
651 | | |
652 | | static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf) |
653 | 259k | { |
654 | 259k | int fd_tmp, tag_length = 0, tag_arg_length = 0; |
655 | 259k | bool binary, retval = false, escape = false, hex = false; |
656 | 259k | int64_t value = 0, tag_val_length = 0; |
657 | 259k | bool look_for_screnc = false, in_screnc = false, text_space_written = false; |
658 | 259k | tag_type in_tag = TAG_DONT_EXTRACT; |
659 | 259k | FILE *stream_in = NULL; |
660 | 259k | html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE; |
661 | 259k | char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1]; |
662 | 259k | char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value; |
663 | 259k | unsigned char *line = NULL, *ptr, *ptr_screnc = NULL; |
664 | 259k | tag_arguments_t tag_args; |
665 | 259k | quoted_state quoted = NOT_QUOTED; |
666 | 259k | unsigned long length = 0; |
667 | 259k | struct screnc_state screnc_state; |
668 | 259k | file_buff_t *file_buff_o2, *file_buff_text; |
669 | 259k | file_buff_t *file_tmp_o1 = NULL; |
670 | 259k | int in_ahref = 0; /* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/ |
671 | 259k | unsigned char *href_contents_begin = NULL; /*beginning of the next portion of <a> contents*/ |
672 | 259k | unsigned char *ptrend = NULL; /*end of <a> contents*/ |
673 | 259k | unsigned char *in_form_action = NULL; /* the action URL of the current <form> tag, if any*/ |
674 | | |
675 | 259k | struct entity_conv conv; |
676 | 259k | unsigned char entity_val[HTML_STR_LENGTH + 1]; |
677 | 259k | size_t entity_val_length = 0; |
678 | 259k | const int dconf_entconv = dconf ? dconf->phishing & PHISHING_CONF_ENTCONV : 1; |
679 | 259k | const int dconf_js = dirname && (dconf ? dconf->doc & DOC_CONF_JSNORM : 1); /* TODO */ |
680 | | /* dconf for phishing engine sets scanContents, so no need for a flag here */ |
681 | 259k | struct parser_state *js_state = NULL; |
682 | 259k | const unsigned char *js_begin = NULL, *js_end = NULL; |
683 | 259k | uint8_t *style_buff = NULL; |
684 | 259k | size_t style_buff_size = 0; |
685 | 259k | const unsigned char *style_begin = NULL, *style_end = NULL; |
686 | 259k | struct tag_contents contents; |
687 | 259k | uint32_t mbchar = 0; |
688 | 259k | uint32_t mbchar2 = 0; |
689 | | |
690 | | /* |
691 | | * Initialize stack buffers. |
692 | | */ |
693 | 259k | memset(filename, 0, sizeof(filename)); |
694 | 259k | memset(tag, 0, sizeof(tag)); |
695 | 259k | memset(tag_arg, 0, sizeof(tag_arg)); |
696 | 259k | memset(tag_val, 0, sizeof(tag_val)); |
697 | 259k | memset(entity_val, 0, sizeof(entity_val)); |
698 | | |
699 | 259k | tag_args.scanContents = 0; /* do we need to store the contents of <a></a>?*/ |
700 | 259k | contents.pos = 0; |
701 | 259k | if (!m_area) { |
702 | 0 | if (fd < 0) { |
703 | 0 | cli_dbgmsg("Invalid HTML fd\n"); |
704 | 0 | return false; |
705 | 0 | } |
706 | 0 | lseek(fd, 0, SEEK_SET); |
707 | 0 | fd_tmp = dup(fd); |
708 | 0 | if (fd_tmp < 0) { |
709 | 0 | return false; |
710 | 0 | } |
711 | 0 | stream_in = fdopen(fd_tmp, "r"); |
712 | 0 | if (!stream_in) { |
713 | 0 | close(fd_tmp); |
714 | 0 | return false; |
715 | 0 | } |
716 | 0 | } |
717 | | |
718 | 259k | tag_args.count = 0; |
719 | 259k | tag_args.tag = NULL; |
720 | 259k | tag_args.value = NULL; |
721 | 259k | tag_args.contents = NULL; |
722 | 259k | if (dirname) { |
723 | 259k | file_buff_o2 = (file_buff_t *)malloc(sizeof(file_buff_t)); |
724 | 259k | if (!file_buff_o2) { |
725 | 0 | cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_o2\n"); |
726 | 0 | file_buff_o2 = file_buff_text = NULL; |
727 | 0 | goto done; |
728 | 0 | } |
729 | | |
730 | | /* this will still contains scripts that are inside comments */ |
731 | 259k | snprintf(filename, 1024, "%s" PATHSEP "nocomment.html", dirname); |
732 | 259k | file_buff_o2->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR); |
733 | 259k | if (file_buff_o2->fd == -1) { |
734 | 0 | cli_dbgmsg("open failed: %s\n", filename); |
735 | 0 | free(file_buff_o2); |
736 | 0 | file_buff_o2 = file_buff_text = NULL; |
737 | 0 | goto done; |
738 | 0 | } |
739 | | |
740 | 259k | file_buff_text = (file_buff_t *)malloc(sizeof(file_buff_t)); |
741 | 259k | if (!file_buff_text) { |
742 | 0 | close(file_buff_o2->fd); |
743 | 0 | free(file_buff_o2); |
744 | 0 | file_buff_o2 = file_buff_text = NULL; |
745 | 0 | cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_text\n"); |
746 | 0 | goto done; |
747 | 0 | } |
748 | | |
749 | 259k | snprintf(filename, 1024, "%s" PATHSEP "notags.html", dirname); |
750 | 259k | file_buff_text->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR); |
751 | 259k | if (file_buff_text->fd == -1) { |
752 | 0 | cli_dbgmsg("open failed: %s\n", filename); |
753 | 0 | close(file_buff_o2->fd); |
754 | 0 | free(file_buff_o2); |
755 | 0 | free(file_buff_text); |
756 | 0 | file_buff_o2 = file_buff_text = NULL; |
757 | 0 | goto done; |
758 | 0 | } |
759 | 259k | file_buff_o2->length = 0; |
760 | 259k | file_buff_text->length = 0; |
761 | 259k | } else { |
762 | 0 | file_buff_o2 = NULL; |
763 | 0 | file_buff_text = NULL; |
764 | 0 | } |
765 | | |
766 | 259k | binary = false; |
767 | | |
768 | 259k | ptr = line = cli_readchunk(stream_in, m_area, 8192); |
769 | | |
770 | 959k | while (line) { |
771 | 700k | if (href_contents_begin) |
772 | 0 | href_contents_begin = ptr; /*start of a new line, last line already appended to contents see below*/ |
773 | 1.03M | while (*ptr && isspace(*ptr)) { |
774 | 337k | ptr++; |
775 | 337k | } |
776 | 3.20G | while (*ptr) { |
777 | 3.20G | if (!binary && *ptr == '\n') { |
778 | | /* Convert it to a space and re-process */ |
779 | 14.7M | *ptr = ' '; |
780 | 14.7M | continue; |
781 | 14.7M | } |
782 | 3.19G | if (!binary && *ptr == '\r') { |
783 | 5.27M | ptr++; |
784 | 5.27M | continue; |
785 | 5.27M | } |
786 | 3.18G | switch (state) { |
787 | 0 | case HTML_SPECIAL_CHAR: |
788 | 0 | cli_dbgmsg("Impossible, special_char can't occur here\n"); |
789 | 0 | break; |
790 | 0 | case HTML_BAD_STATE: |
791 | | /* An engine error has occurred */ |
792 | 0 | cli_dbgmsg("HTML Engine Error\n"); |
793 | 0 | goto done; |
794 | 81.7M | case HTML_SKIP_WS: |
795 | 81.7M | if (isspace(*ptr)) { |
796 | 15.0M | ptr++; |
797 | 66.6M | } else { |
798 | 66.6M | state = next_state; |
799 | 66.6M | next_state = HTML_BAD_STATE; |
800 | 66.6M | } |
801 | 81.7M | break; |
802 | 156M | case HTML_TRIM_WS: |
803 | 156M | if (isspace(*ptr)) { |
804 | 93.7M | ptr++; |
805 | 93.7M | } else { |
806 | 62.6M | if (in_tag == TAG_DONT_EXTRACT) { |
807 | 9.52M | html_output_c(file_buff_o2, ' '); |
808 | 9.52M | } |
809 | 62.6M | state = next_state; |
810 | 62.6M | next_state = HTML_BAD_STATE; |
811 | 62.6M | } |
812 | 156M | break; |
813 | 420M | case HTML_8BIT: |
814 | 420M | if (*ptr < 0x80 || mbchar >= 0x10000) { |
815 | 190M | if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 || |
816 | 190M | (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) { |
817 | | /* bb #4097 */ |
818 | 481k | html_output_c(file_buff_o2, '.'); |
819 | 481k | html_output_c(file_buff_text, '.'); |
820 | 481k | if (mbchar == 0xA1) { |
821 | 330k | ptr++; |
822 | 330k | mbchar = 0; |
823 | 330k | continue; |
824 | 330k | } |
825 | 189M | } else { |
826 | 189M | uint8_t c0 = mbchar >> 16; |
827 | 189M | uint8_t c1 = (mbchar >> 8) & 0xff; |
828 | 189M | uint8_t c2 = (mbchar & 0xff); |
829 | 189M | if (c0) { |
830 | 101M | html_output_c(file_buff_o2, c0); |
831 | 101M | html_output_c(file_buff_text, c0); |
832 | 101M | } |
833 | 189M | if (c0 || c1) { |
834 | 127M | html_output_c(file_buff_o2, c1); |
835 | 127M | html_output_c(file_buff_text, c1); |
836 | 127M | } |
837 | 189M | html_output_c(file_buff_o2, c2); |
838 | 189M | html_output_c(file_buff_text, c1); |
839 | 189M | } |
840 | 190M | mbchar = 0; |
841 | 190M | state = next_state; |
842 | 190M | next_state = HTML_NORM; |
843 | 230M | } else { |
844 | 230M | mbchar = (mbchar << 8) | *ptr; |
845 | 230M | ptr++; |
846 | 230M | } |
847 | 420M | break; |
848 | 1.99G | case HTML_NORM: |
849 | 1.99G | if (*ptr == '<') { |
850 | 37.0M | ptrend = ptr; /* for use by scanContents */ |
851 | 37.0M | html_output_c(file_buff_o2, '<'); |
852 | 37.0M | if (in_tag == TAG_DONT_EXTRACT && !text_space_written) { |
853 | 3.61M | html_output_c(file_buff_text, ' '); |
854 | 3.61M | text_space_written = true; |
855 | 3.61M | } |
856 | 37.0M | if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin) { |
857 | | /*append this text portion to the contents of <a>*/ |
858 | 0 | html_tag_contents_append(&contents, href_contents_begin, ptr); |
859 | 0 | href_contents_begin = NULL; /*We just encountered another tag inside <a>, so skip it*/ |
860 | 0 | } |
861 | 37.0M | ptr++; |
862 | 37.0M | state = HTML_SKIP_WS; |
863 | 37.0M | tag_length = 0; |
864 | 37.0M | next_state = HTML_TAG; |
865 | 1.96G | } else if (isspace(*ptr)) { |
866 | 62.7M | if (!text_space_written && in_tag == TAG_DONT_EXTRACT) { |
867 | 8.42M | html_output_c(file_buff_text, ' '); |
868 | 8.42M | text_space_written = true; |
869 | 8.42M | } |
870 | 62.7M | state = HTML_TRIM_WS; |
871 | 62.7M | next_state = HTML_NORM; |
872 | 1.89G | } else if (*ptr == '&') { |
873 | 9.78M | if (!text_space_written && in_tag == TAG_DONT_EXTRACT) { |
874 | 773k | html_output_c(file_buff_text, ' '); |
875 | 773k | text_space_written = true; |
876 | 773k | } |
877 | 9.78M | state = HTML_CHAR_REF; |
878 | 9.78M | next_state = HTML_NORM; |
879 | 9.78M | ptr++; |
880 | 1.89G | } else if (*ptr >= 0x80) { |
881 | 190M | state = HTML_8BIT; |
882 | 190M | next_state = HTML_NORM; |
883 | 190M | mbchar = *ptr; |
884 | 190M | ptr++; |
885 | 1.69G | } else { |
886 | 1.69G | unsigned char c = tolower(*ptr); |
887 | | /* normalize ' to " for scripts */ |
888 | 1.69G | if (in_tag != TAG_DONT_EXTRACT && c == '\'') c = '"'; |
889 | 1.69G | html_output_c(file_buff_o2, c); |
890 | 1.69G | if (in_tag == TAG_DONT_EXTRACT) { |
891 | 238M | if (*ptr < 0x20) { |
892 | 23.2M | if (!text_space_written) { |
893 | 11.0M | html_output_c(file_buff_text, ' '); |
894 | 11.0M | text_space_written = true; |
895 | 11.0M | } |
896 | 215M | } else { |
897 | 215M | html_output_c(file_buff_text, c); |
898 | 215M | text_space_written = false; |
899 | 215M | } |
900 | 238M | } |
901 | 1.69G | ptr++; |
902 | 1.69G | } |
903 | 1.99G | break; |
904 | 152M | case HTML_TAG: |
905 | 152M | if ((tag_length == 0) && (*ptr == '!')) { |
906 | | /* Comment */ |
907 | 526k | if (in_tag != TAG_DONT_EXTRACT) { |
908 | | /* we still write scripts to nocomment.html */ |
909 | 474k | html_output_c(file_buff_o2, '!'); |
910 | 474k | } else { |
911 | | /* Need to rewind in the no-comment output stream */ |
912 | 52.3k | if (file_buff_o2 && (file_buff_o2->length > 0)) { |
913 | 52.3k | file_buff_o2->length--; |
914 | 52.3k | } |
915 | 52.3k | } |
916 | 526k | state = HTML_COMMENT; |
917 | 526k | next_state = HTML_BAD_STATE; |
918 | 526k | ptr++; |
919 | 152M | } else if (*ptr == '>') { |
920 | 7.06M | html_output_c(file_buff_o2, '>'); |
921 | 7.06M | ptr++; |
922 | 7.06M | tag[tag_length] = '\0'; |
923 | 7.06M | state = HTML_SKIP_WS; |
924 | 7.06M | next_state = HTML_PROCESS_TAG; |
925 | 144M | } else if (!isspace(*ptr)) { |
926 | 142M | html_output_c(file_buff_o2, tolower(*ptr)); |
927 | | /* if we're inside a script we only care for </script>.*/ |
928 | 142M | if (in_tag != TAG_DONT_EXTRACT && tag_length == 0 && *ptr != '/') { |
929 | 26.7M | state = HTML_NORM; |
930 | 26.7M | } |
931 | 142M | if (tag_length < HTML_STR_LENGTH) { |
932 | 139M | tag[tag_length++] = tolower(*ptr); |
933 | 139M | } |
934 | 142M | ptr++; |
935 | 142M | } else { |
936 | 2.69M | tag[tag_length] = '\0'; |
937 | 2.69M | state = HTML_SKIP_WS; |
938 | 2.69M | tag_arg_length = 0; |
939 | | /* if we'd go to HTML_TAG_ARG whitespace would be inconsistently normalized for in_tag*/ |
940 | 2.69M | next_state = in_tag == TAG_DONT_EXTRACT ? HTML_TAG_ARG : HTML_PROCESS_TAG; |
941 | 2.69M | } |
942 | 152M | break; |
943 | 119M | case HTML_TAG_ARG: |
944 | 119M | if (*ptr == '=') { |
945 | 2.39M | html_output_c(file_buff_o2, '='); |
946 | 2.39M | tag_arg[tag_arg_length] = '\0'; |
947 | 2.39M | ptr++; |
948 | 2.39M | state = HTML_SKIP_WS; |
949 | 2.39M | escape = false; |
950 | 2.39M | quoted = NOT_QUOTED; |
951 | 2.39M | tag_val_length = 0; |
952 | 2.39M | next_state = HTML_TAG_ARG_VAL; |
953 | 117M | } else if (isspace(*ptr)) { |
954 | 4.35M | ptr++; |
955 | 4.35M | tag_arg[tag_arg_length] = '\0'; |
956 | 4.35M | state = HTML_SKIP_WS; |
957 | 4.35M | next_state = HTML_TAG_ARG_EQUAL; |
958 | 113M | } else if (*ptr == '>') { |
959 | 2.50M | html_output_c(file_buff_o2, '>'); |
960 | 2.50M | if (tag_arg_length > 0) { |
961 | 1.24M | tag_arg[tag_arg_length] = '\0'; |
962 | 1.24M | html_tag_arg_add(&tag_args, tag_arg, NULL); |
963 | 1.24M | } |
964 | 2.50M | ptr++; |
965 | 2.50M | state = HTML_PROCESS_TAG; |
966 | 2.50M | next_state = HTML_BAD_STATE; |
967 | 110M | } else { |
968 | 110M | if (tag_arg_length == 0) { |
969 | | /* Start of new tag - add space */ |
970 | 7.82M | html_output_c(file_buff_o2, ' '); |
971 | 7.82M | } |
972 | 110M | html_output_c(file_buff_o2, tolower(*ptr)); |
973 | 110M | if (tag_arg_length < HTML_STR_LENGTH) { |
974 | 107M | tag_arg[tag_arg_length++] = tolower(*ptr); |
975 | 107M | } |
976 | 110M | ptr++; |
977 | 110M | } |
978 | 119M | break; |
979 | 4.34M | case HTML_TAG_ARG_EQUAL: |
980 | 4.34M | if (*ptr == '=') { |
981 | 151k | html_output_c(file_buff_o2, '='); |
982 | 151k | ptr++; |
983 | 151k | state = HTML_SKIP_WS; |
984 | 151k | escape = false; |
985 | 151k | quoted = NOT_QUOTED; |
986 | 151k | tag_val_length = 0; |
987 | 151k | next_state = HTML_TAG_ARG_VAL; |
988 | 4.19M | } else { |
989 | 4.19M | if (tag_arg_length > 0) { |
990 | 4.19M | tag_arg[tag_arg_length] = '\0'; |
991 | 4.19M | html_tag_arg_add(&tag_args, tag_arg, NULL); |
992 | 4.19M | } |
993 | 4.19M | tag_arg_length = 0; |
994 | 4.19M | state = HTML_TAG_ARG; |
995 | 4.19M | next_state = HTML_BAD_STATE; |
996 | 4.19M | } |
997 | 4.34M | break; |
998 | 59.0M | case HTML_TAG_ARG_VAL: |
999 | 59.0M | if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) { |
1000 | | /* RFC2397 inline data */ |
1001 | | |
1002 | | /* Rewind one byte so we don't recursive */ |
1003 | 21.6k | if (file_buff_o2 && (file_buff_o2->length > 0)) { |
1004 | 21.6k | file_buff_o2->length--; |
1005 | 21.6k | } |
1006 | | |
1007 | 21.6k | if (quoted != NOT_QUOTED) { |
1008 | 0 | html_output_c(file_buff_o2, '"'); |
1009 | 0 | } |
1010 | 21.6k | tag_val_length = 0; |
1011 | 21.6k | state = HTML_RFC2397_TYPE; |
1012 | 21.6k | next_state = HTML_TAG_ARG; |
1013 | 59.0M | } else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) { |
1014 | | /* RFC2397 inline data */ |
1015 | | |
1016 | | /* Rewind one byte so we don't recursive */ |
1017 | 117k | if (file_buff_o2 && (file_buff_o2->length > 0)) { |
1018 | 117k | file_buff_o2->length--; |
1019 | 117k | } |
1020 | | |
1021 | 117k | if (quoted != NOT_QUOTED) { |
1022 | 117k | html_output_c(file_buff_o2, '"'); |
1023 | 117k | } |
1024 | | |
1025 | 117k | tag_val_length = 0; |
1026 | 117k | state = HTML_RFC2397_TYPE; |
1027 | 117k | next_state = HTML_TAG_ARG; |
1028 | 58.9M | } else if (*ptr == '&') { |
1029 | 1.00M | state = HTML_CHAR_REF; |
1030 | 1.00M | next_state = HTML_TAG_ARG_VAL; |
1031 | 1.00M | ptr++; |
1032 | 57.9M | } else if (*ptr == '\'') { |
1033 | 766k | if (tag_val_length == 0) { |
1034 | 287k | quoted = SINGLE_QUOTED; |
1035 | 287k | html_output_c(file_buff_o2, '"'); |
1036 | 287k | if (tag_val_length < HTML_STR_LENGTH) { |
1037 | 287k | tag_val[tag_val_length++] = '"'; |
1038 | 287k | } |
1039 | 287k | ptr++; |
1040 | 479k | } else { |
1041 | 479k | if (!escape && (quoted == SINGLE_QUOTED)) { |
1042 | 121k | html_output_c(file_buff_o2, '"'); |
1043 | 121k | if (tag_val_length < HTML_STR_LENGTH) { |
1044 | 120k | tag_val[tag_val_length++] = '"'; |
1045 | 120k | } |
1046 | 121k | tag_val[tag_val_length] = '\0'; |
1047 | 121k | html_tag_arg_add(&tag_args, tag_arg, tag_val); |
1048 | 121k | ptr++; |
1049 | 121k | state = HTML_SKIP_WS; |
1050 | 121k | tag_arg_length = 0; |
1051 | 121k | next_state = HTML_TAG_ARG; |
1052 | 357k | } else { |
1053 | 357k | html_output_c(file_buff_o2, '"'); |
1054 | 357k | if (tag_val_length < HTML_STR_LENGTH) { |
1055 | 320k | tag_val[tag_val_length++] = '"'; |
1056 | 320k | } |
1057 | 357k | ptr++; |
1058 | 357k | } |
1059 | 479k | } |
1060 | 57.1M | } else if (*ptr == '"') { |
1061 | 1.03M | if (tag_val_length == 0) { |
1062 | 508k | quoted = DOUBLE_QUOTED; |
1063 | 508k | html_output_c(file_buff_o2, '"'); |
1064 | 508k | if (tag_val_length < HTML_STR_LENGTH) { |
1065 | 508k | tag_val[tag_val_length++] = '"'; |
1066 | 508k | } |
1067 | 508k | ptr++; |
1068 | 527k | } else { |
1069 | 527k | if (!escape && (quoted == DOUBLE_QUOTED)) { |
1070 | 160k | html_output_c(file_buff_o2, '"'); |
1071 | 160k | if (tag_val_length < HTML_STR_LENGTH) { |
1072 | 159k | tag_val[tag_val_length++] = '"'; |
1073 | 159k | } |
1074 | 160k | tag_val[tag_val_length] = '\0'; |
1075 | 160k | html_tag_arg_add(&tag_args, tag_arg, tag_val); |
1076 | 160k | ptr++; |
1077 | 160k | state = HTML_SKIP_WS; |
1078 | 160k | tag_arg_length = 0; |
1079 | 160k | next_state = HTML_TAG_ARG; |
1080 | 366k | } else { |
1081 | 366k | html_output_c(file_buff_o2, '"'); |
1082 | 366k | if (tag_val_length < HTML_STR_LENGTH) { |
1083 | 363k | tag_val[tag_val_length++] = '"'; |
1084 | 363k | } |
1085 | 366k | ptr++; |
1086 | 366k | } |
1087 | 527k | } |
1088 | 56.1M | } else if (isspace(*ptr) || (*ptr == '>')) { |
1089 | 2.50M | if (quoted == NOT_QUOTED) { |
1090 | 2.11M | tag_val[tag_val_length] = '\0'; |
1091 | 2.11M | html_tag_arg_add(&tag_args, tag_arg, tag_val); |
1092 | 2.11M | state = HTML_SKIP_WS; |
1093 | 2.11M | tag_arg_length = 0; |
1094 | 2.11M | next_state = HTML_TAG_ARG; |
1095 | 2.11M | } else { |
1096 | 395k | html_output_c(file_buff_o2, *ptr); |
1097 | 395k | if (tag_val_length < HTML_STR_LENGTH) { |
1098 | 393k | if (isspace(*ptr)) { |
1099 | 107k | tag_val[tag_val_length++] = ' '; |
1100 | 285k | } else { |
1101 | 285k | tag_val[tag_val_length++] = '>'; |
1102 | 285k | } |
1103 | 393k | } |
1104 | 395k | state = HTML_SKIP_WS; |
1105 | 395k | escape = false; |
1106 | 395k | quoted = NOT_QUOTED; |
1107 | 395k | next_state = HTML_TAG_ARG_VAL; |
1108 | 395k | ptr++; |
1109 | 395k | } |
1110 | 53.6M | } else { |
1111 | 53.6M | if (mbchar2 && (*ptr < 0x80 || mbchar2 >= 0x10000)) { |
1112 | 5.67M | if (mbchar2 == 0xE38082 || mbchar2 == 0xEFBC8E || mbchar2 == 0xEFB992 || |
1113 | 5.67M | (mbchar2 == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) { |
1114 | 174k | html_output_c(file_buff_o2, '.'); |
1115 | 174k | if (tag_val_length < HTML_STR_LENGTH) |
1116 | 172k | tag_val[tag_val_length++] = '.'; |
1117 | 174k | if (mbchar2 == 0xA1) { |
1118 | 149k | ptr++; |
1119 | 149k | mbchar2 = 0; |
1120 | 149k | continue; |
1121 | 149k | } |
1122 | 5.50M | } else { |
1123 | 5.50M | uint8_t c0 = mbchar2 >> 16; |
1124 | 5.50M | uint8_t c1 = (mbchar2 >> 8) & 0xff; |
1125 | 5.50M | uint8_t c2 = (mbchar2 & 0xff); |
1126 | 5.50M | if (c0) |
1127 | 3.12M | html_output_c(file_buff_o2, c0); |
1128 | 5.50M | if (c0 || c1) |
1129 | 3.69M | html_output_c(file_buff_o2, c1); |
1130 | 5.50M | html_output_c(file_buff_o2, c2); |
1131 | 5.50M | if (c0 && tag_val_length < HTML_STR_LENGTH) |
1132 | 2.72M | tag_val[tag_val_length++] = c0; |
1133 | 5.50M | if ((c0 || c1) && tag_val_length < HTML_STR_LENGTH) |
1134 | 3.28M | tag_val[tag_val_length++] = c1; |
1135 | 5.50M | if (tag_val_length < HTML_STR_LENGTH) |
1136 | 5.06M | tag_val[tag_val_length++] = c2; |
1137 | 5.50M | } |
1138 | 5.52M | mbchar2 = 0; |
1139 | 5.52M | } |
1140 | 53.4M | if (*ptr >= 0x80) |
1141 | 12.5M | mbchar2 = (mbchar2 << 8) | *ptr; |
1142 | 40.8M | else { |
1143 | 40.8M | html_output_c(file_buff_o2, tolower(*ptr)); |
1144 | 40.8M | if (tag_val_length < HTML_STR_LENGTH) { |
1145 | 37.9M | tag_val[tag_val_length++] = *ptr; |
1146 | 37.9M | } |
1147 | 40.8M | } |
1148 | 53.4M | ptr++; |
1149 | 53.4M | } |
1150 | | |
1151 | 58.9M | if (*ptr == '\\') { |
1152 | 68.1k | escape = true; |
1153 | 58.8M | } else { |
1154 | 58.8M | escape = false; |
1155 | 58.8M | } |
1156 | 58.9M | break; |
1157 | 70.5M | case HTML_COMMENT: |
1158 | 70.5M | if (in_tag != TAG_DONT_EXTRACT && !isspace(*ptr)) { |
1159 | 60.3M | unsigned char c = tolower(*ptr); |
1160 | | /* dump script to nocomment.html, since we no longer have |
1161 | | * comment.html/script.html */ |
1162 | 60.3M | if (c == '\'') c = '"'; |
1163 | 60.3M | html_output_c(file_buff_o2, c); |
1164 | 60.3M | } |
1165 | 70.5M | if (*ptr == '>') { |
1166 | 520k | state = HTML_SKIP_WS; |
1167 | 520k | next_state = HTML_NORM; |
1168 | 520k | } |
1169 | 70.5M | ptr++; |
1170 | 70.5M | break; |
1171 | 9.60M | case HTML_PROCESS_TAG: |
1172 | | |
1173 | | /* Default to no action for this tag */ |
1174 | 9.60M | state = HTML_SKIP_WS; |
1175 | 9.60M | next_state = HTML_NORM; |
1176 | 9.60M | if (tag[0] == '/') { |
1177 | | /* End tag */ |
1178 | 2.22M | state = HTML_SKIP_WS; |
1179 | 2.22M | next_state = HTML_NORM; |
1180 | | |
1181 | 2.22M | if (strcmp(tag, "/script") == 0) { |
1182 | 667k | in_tag = TAG_DONT_EXTRACT; |
1183 | 667k | if (js_state) { |
1184 | 184k | js_end = ptr; |
1185 | 184k | js_process(js_state, js_begin, js_end, line, ptr, in_tag, dirname); |
1186 | 184k | js_state = NULL; |
1187 | 184k | js_begin = js_end = NULL; |
1188 | 184k | } |
1189 | | /*don't output newlines in nocomment.html |
1190 | | * html_output_c(file_buff_o2, '\n');*/ |
1191 | 1.55M | } else if ((strcmp(tag, "/style") == 0) && (in_tag == TAG_STYLE)) { |
1192 | 19.8k | size_t chunk_size; |
1193 | | |
1194 | 19.8k | style_end = ptr - strlen("</style>"); |
1195 | | |
1196 | 19.8k | if (style_end < style_begin) { |
1197 | 416 | cli_dbgmsg("cli_html_normalise: style chunk size underflow\n"); |
1198 | 416 | goto done; |
1199 | 416 | } |
1200 | | |
1201 | 19.4k | chunk_size = style_end - style_begin; |
1202 | | |
1203 | 19.4k | if (style_buff == NULL) { |
1204 | 14.4k | CLI_MAX_MALLOC_OR_GOTO_DONE(style_buff, chunk_size + 1); |
1205 | 14.4k | } else { |
1206 | 5.03k | CLI_MAX_REALLOC_OR_GOTO_DONE(style_buff, style_buff_size + chunk_size + 1); |
1207 | 5.03k | } |
1208 | | |
1209 | 19.4k | memcpy(style_buff + style_buff_size, style_begin, chunk_size); |
1210 | | |
1211 | 19.4k | style_buff_size += chunk_size; |
1212 | 19.4k | style_buff[style_buff_size] = '\0'; |
1213 | | |
1214 | 19.4k | in_tag = TAG_DONT_EXTRACT; |
1215 | 19.4k | style_begin = style_end = NULL; |
1216 | 19.4k | } |
1217 | | |
1218 | 2.22M | if (hrefs && hrefs->scanContents && in_ahref) { |
1219 | 0 | if (strcmp(tag, "/a") == 0) { |
1220 | 0 | html_tag_contents_done(hrefs, in_ahref, &contents); |
1221 | 0 | in_ahref = 0; /* we are no longer inside an <a href> |
1222 | | nesting <a> tags not supported, and shouldn't be supported*/ |
1223 | 0 | } |
1224 | 0 | href_contents_begin = ptr; |
1225 | 0 | } |
1226 | 2.22M | if (strcmp(tag, "/form") == 0) { |
1227 | 790 | if (in_form_action) |
1228 | 0 | free(in_form_action); |
1229 | 790 | in_form_action = NULL; |
1230 | 790 | } |
1231 | 7.38M | } else if (strcmp(tag, "script") == 0) { |
1232 | 364k | arg_value = html_tag_arg_value(&tag_args, "language"); |
1233 | | /* TODO: maybe we can output all tags only via html_output_tag */ |
1234 | 364k | if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0)) { |
1235 | 7.10k | html_tag_arg_set(&tag_args, "language", "javascript"); |
1236 | 7.10k | state = HTML_SKIP_WS; |
1237 | 7.10k | next_state = HTML_JSDECODE; |
1238 | | /* we already output the old tag, output the new tag now */ |
1239 | 7.10k | html_output_tag(file_buff_o2, tag, &tag_args); |
1240 | 357k | } else if (arg_value && (strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) { |
1241 | 7.65k | html_tag_arg_set(&tag_args, "language", "vbscript"); |
1242 | 7.65k | state = HTML_SKIP_WS; |
1243 | 7.65k | next_state = HTML_JSDECODE; |
1244 | | /* we already output the old tag, output the new tag now */ |
1245 | 7.65k | html_output_tag(file_buff_o2, tag, &tag_args); |
1246 | 7.65k | } |
1247 | 364k | in_tag = TAG_SCRIPT; |
1248 | 364k | if (dconf_js && !js_state) { |
1249 | 364k | js_state = cli_js_init(); |
1250 | 364k | if (!js_state) { |
1251 | 0 | cli_dbgmsg("htmlnorm: Failed to initialize js parser\n"); |
1252 | 0 | } |
1253 | 364k | js_begin = ptr; |
1254 | 364k | js_end = NULL; |
1255 | 364k | } |
1256 | 7.01M | } else if (strcmp(tag, "style") == 0) { |
1257 | 35.7k | in_tag = TAG_STYLE; |
1258 | 35.7k | style_begin = ptr; |
1259 | 35.7k | style_end = NULL; |
1260 | 6.98M | } else if (strcmp(tag, "%@") == 0) { |
1261 | 675k | arg_value = html_tag_arg_value(&tag_args, "language"); |
1262 | 675k | if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0 || |
1263 | 486k | strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) { |
1264 | | |
1265 | 202k | saved_next_state = next_state; |
1266 | 202k | next_state = state; |
1267 | 202k | look_for_screnc = false; |
1268 | 202k | state = HTML_LOOKFOR_SCRENC; |
1269 | 202k | } |
1270 | 6.30M | } else if (hrefs) { |
1271 | 0 | if (in_ahref && !href_contents_begin) |
1272 | 0 | href_contents_begin = ptr; |
1273 | 0 | if (strcmp(tag, "a") == 0) { |
1274 | 0 | arg_value = html_tag_arg_value(&tag_args, "href"); |
1275 | 0 | if (arg_value && strlen((const char *)arg_value) > 0) { |
1276 | 0 | if (hrefs->scanContents) { |
1277 | 0 | char *arg_value_title = html_tag_arg_value(&tag_args, "title"); |
1278 | | /*beginning of an <a> tag*/ |
1279 | 0 | if (in_ahref) |
1280 | | /*we encountered nested <a> tags, pretend previous closed*/ |
1281 | 0 | if (href_contents_begin) { |
1282 | 0 | html_tag_contents_append(&contents, href_contents_begin, ptrend); |
1283 | | /*add pending contents between tags*/ |
1284 | 0 | html_tag_contents_done(hrefs, in_ahref, &contents); |
1285 | 0 | in_ahref = 0; |
1286 | 0 | } |
1287 | 0 | if (arg_value_title) { |
1288 | | /* title is a 'displayed link'*/ |
1289 | 0 | html_tag_arg_add(hrefs, "href_title", arg_value_title); |
1290 | 0 | html_tag_contents_append(&contents, (const unsigned char *)arg_value, |
1291 | 0 | (const unsigned char *)arg_value + strlen(arg_value)); |
1292 | 0 | html_tag_contents_done(hrefs, hrefs->count, &contents); |
1293 | 0 | } |
1294 | 0 | if (in_form_action) { |
1295 | | /* form action is the real URL, and href is the 'displayed' */ |
1296 | 0 | html_tag_arg_add(hrefs, "form", arg_value); |
1297 | 0 | contents.pos = 0; |
1298 | 0 | html_tag_contents_append(&contents, in_form_action, |
1299 | 0 | in_form_action + strlen((const char *)in_form_action)); |
1300 | 0 | html_tag_contents_done(hrefs, hrefs->count, &contents); |
1301 | 0 | } |
1302 | 0 | } |
1303 | 0 | html_tag_arg_add(hrefs, "href", arg_value); |
1304 | 0 | if (hrefs->scanContents) { |
1305 | 0 | in_ahref = hrefs->count; /* index of this tag (counted from 1) */ |
1306 | 0 | href_contents_begin = ptr; /* contents begin after <a ..> ends */ |
1307 | 0 | contents.pos = 0; |
1308 | 0 | } |
1309 | 0 | } |
1310 | 0 | } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) { |
1311 | 0 | const char *arg_action_value = html_tag_arg_value(&tag_args, "action"); |
1312 | 0 | if (arg_action_value) { |
1313 | 0 | if (in_form_action) |
1314 | 0 | free(in_form_action); |
1315 | 0 | in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value); |
1316 | 0 | } |
1317 | 0 | } else if (strcmp(tag, "img") == 0) { |
1318 | 0 | arg_value = html_tag_arg_value(&tag_args, "src"); |
1319 | 0 | if (arg_value && strlen(arg_value) > 0) { |
1320 | 0 | html_tag_arg_add(hrefs, "src", arg_value); |
1321 | 0 | if (hrefs->scanContents && in_ahref) |
1322 | | /* "contents" of an img tag, is the URL of its parent <a> tag */ |
1323 | 0 | hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]); |
1324 | 0 | if (in_form_action) { |
1325 | | /* form action is the real URL, and href is the 'displayed' */ |
1326 | 0 | html_tag_arg_add(hrefs, "form", arg_value); |
1327 | 0 | contents.pos = 0; |
1328 | 0 | html_tag_contents_append(&contents, in_form_action, |
1329 | 0 | in_form_action + strlen((const char *)in_form_action)); |
1330 | 0 | html_tag_contents_done(hrefs, hrefs->count, &contents); |
1331 | 0 | } |
1332 | 0 | } |
1333 | 0 | arg_value = html_tag_arg_value(&tag_args, "dynsrc"); |
1334 | 0 | if (arg_value && strlen(arg_value) > 0) { |
1335 | 0 | html_tag_arg_add(hrefs, "dynsrc", arg_value); |
1336 | 0 | if (hrefs->scanContents && in_ahref) |
1337 | | /* see above */ |
1338 | 0 | hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]); |
1339 | 0 | if (in_form_action) { |
1340 | | /* form action is the real URL, and href is the 'displayed' */ |
1341 | 0 | html_tag_arg_add(hrefs, "form", arg_value); |
1342 | 0 | contents.pos = 0; |
1343 | 0 | html_tag_contents_append(&contents, in_form_action, |
1344 | 0 | in_form_action + strlen((const char *)in_form_action)); |
1345 | 0 | html_tag_contents_done(hrefs, hrefs->count, &contents); |
1346 | 0 | } |
1347 | 0 | } |
1348 | 0 | } else if (strcmp(tag, "iframe") == 0) { |
1349 | 0 | arg_value = html_tag_arg_value(&tag_args, "src"); |
1350 | 0 | if (arg_value && strlen(arg_value) > 0) { |
1351 | 0 | html_tag_arg_add(hrefs, "iframe", arg_value); |
1352 | 0 | if (hrefs->scanContents && in_ahref) |
1353 | | /* see above */ |
1354 | 0 | hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]); |
1355 | 0 | if (in_form_action) { |
1356 | | /* form action is the real URL, and href is the 'displayed' */ |
1357 | 0 | html_tag_arg_add(hrefs, "form", arg_value); |
1358 | 0 | contents.pos = 0; |
1359 | 0 | html_tag_contents_append(&contents, in_form_action, |
1360 | 0 | in_form_action + strlen((const char *)in_form_action)); |
1361 | 0 | html_tag_contents_done(hrefs, hrefs->count, &contents); |
1362 | 0 | } |
1363 | 0 | } |
1364 | 0 | } else if (strcmp(tag, "area") == 0) { |
1365 | 0 | arg_value = html_tag_arg_value(&tag_args, "href"); |
1366 | 0 | if (arg_value && strlen(arg_value) > 0) { |
1367 | 0 | html_tag_arg_add(hrefs, "area", arg_value); |
1368 | 0 | if (hrefs->scanContents && in_ahref) |
1369 | | /* see above */ |
1370 | 0 | hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]); |
1371 | 0 | if (in_form_action) { |
1372 | | /* form action is the real URL, and href is the 'displayed' */ |
1373 | 0 | html_tag_arg_add(hrefs, "form", arg_value); |
1374 | 0 | contents.pos = 0; |
1375 | 0 | html_tag_contents_append(&contents, in_form_action, |
1376 | 0 | in_form_action + strlen((const char *)in_form_action)); |
1377 | 0 | html_tag_contents_done(hrefs, hrefs->count, &contents); |
1378 | 0 | } |
1379 | 0 | } |
1380 | 0 | } |
1381 | | /* TODO:imagemaps can have urls too */ |
1382 | 6.30M | } else if (strcmp(tag, "a") == 0) { |
1383 | | /* a/img tags for buff_text can be processed only if we're not processing hrefs */ |
1384 | 744k | arg_value = html_tag_arg_value(&tag_args, "href"); |
1385 | 744k | if (arg_value && arg_value[0]) { |
1386 | 539k | html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value)); |
1387 | 539k | html_output_c(file_buff_text, ' '); |
1388 | 539k | text_space_written = true; |
1389 | 539k | } |
1390 | 5.56M | } else if (strcmp(tag, "img") == 0) { |
1391 | 94.2k | arg_value = html_tag_arg_value(&tag_args, "src"); |
1392 | 94.2k | if (arg_value && arg_value[0]) { |
1393 | 15.9k | html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value)); |
1394 | 15.9k | html_output_c(file_buff_text, ' '); |
1395 | 15.9k | text_space_written = true; |
1396 | 15.9k | } |
1397 | 94.2k | } |
1398 | 9.60M | html_tag_arg_free(&tag_args); |
1399 | 9.60M | break; |
1400 | 10.8M | case HTML_CHAR_REF: |
1401 | 10.8M | if (*ptr == '#') { |
1402 | 2.96M | value = 0; |
1403 | 2.96M | hex = false; |
1404 | 2.96M | state = HTML_CHAR_REF_DECODE; |
1405 | 2.96M | ptr++; |
1406 | 7.89M | } else { |
1407 | 7.89M | if (dconf_entconv) |
1408 | 7.89M | state = HTML_ENTITY_REF_DECODE; |
1409 | 0 | else { |
1410 | 0 | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1411 | 0 | tag_val[tag_val_length++] = '&'; |
1412 | 0 | } |
1413 | 0 | html_output_c(file_buff_o2, '&'); |
1414 | |
|
1415 | 0 | state = next_state; |
1416 | 0 | next_state = HTML_BAD_STATE; |
1417 | 0 | } |
1418 | 7.89M | } |
1419 | 10.8M | break; |
1420 | 19.7M | case HTML_ENTITY_REF_DECODE: |
1421 | 19.7M | if (*ptr == ';') { |
1422 | 1.14M | size_t i; |
1423 | 1.14M | const char *normalized; |
1424 | 1.14M | entity_val[entity_val_length] = '\0'; |
1425 | 1.14M | normalized = entity_norm(&conv, entity_val); |
1426 | 1.14M | if (normalized) { |
1427 | 527k | for (i = 0; i < strlen(normalized); i++) { |
1428 | 425k | const unsigned char c = normalized[i] & 0xff; |
1429 | 425k | html_output_c(file_buff_o2, c); |
1430 | 425k | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1431 | 59.3k | tag_val[tag_val_length++] = c; |
1432 | 59.3k | } |
1433 | 425k | } |
1434 | 1.04M | } else { |
1435 | 1.04M | html_output_c(file_buff_o2, '&'); |
1436 | 1.04M | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1437 | 270k | tag_val[tag_val_length++] = '&'; |
1438 | 270k | } |
1439 | 2.57M | for (i = 0; i < entity_val_length; i++) { |
1440 | 1.52M | const char c = tolower(entity_val[i]); |
1441 | 1.52M | html_output_c(file_buff_o2, c); |
1442 | 1.52M | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1443 | 290k | tag_val[tag_val_length++] = c; |
1444 | 290k | } |
1445 | 1.52M | } |
1446 | 1.04M | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1447 | 270k | tag_val[tag_val_length++] = ';'; |
1448 | 270k | } |
1449 | 1.04M | html_output_c(file_buff_o2, ';'); |
1450 | 1.04M | } |
1451 | 1.14M | entity_val_length = 0; |
1452 | 1.14M | state = next_state; |
1453 | 1.14M | next_state = HTML_BAD_STATE; |
1454 | 1.14M | ptr++; |
1455 | 18.6M | } else if ((isalnum(*ptr) || *ptr == '_' || *ptr == ':' || (*ptr == '-')) && entity_val_length < HTML_STR_LENGTH) { |
1456 | 11.8M | entity_val[entity_val_length++] = *ptr++; |
1457 | 11.8M | } else { |
1458 | | /* entity too long, or not valid, dump it */ |
1459 | 6.74M | size_t i; |
1460 | 6.74M | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1461 | 468k | tag_val[tag_val_length++] = '&'; |
1462 | 468k | } |
1463 | 6.74M | html_output_c(file_buff_o2, '&'); |
1464 | 16.5M | for (i = 0; i < entity_val_length; i++) { |
1465 | 9.75M | const char c = tolower(entity_val[i]); |
1466 | 9.75M | html_output_c(file_buff_o2, c); |
1467 | 9.75M | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1468 | 836k | tag_val[tag_val_length++] = c; |
1469 | 836k | } |
1470 | 9.75M | } |
1471 | | |
1472 | 6.74M | state = next_state; |
1473 | 6.74M | next_state = HTML_BAD_STATE; |
1474 | 6.74M | entity_val_length = 0; |
1475 | 6.74M | } |
1476 | 19.7M | break; |
1477 | 9.67M | case HTML_CHAR_REF_DECODE: |
1478 | 9.67M | if ((value == 0) && ((*ptr == 'x') || (*ptr == 'X'))) { |
1479 | 217k | hex = true; |
1480 | 217k | ptr++; |
1481 | 9.46M | } else if (*ptr == ';') { |
1482 | 2.02M | if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { |
1483 | 164k | tag_val[tag_val_length++] = value; /* store encoded values too */ |
1484 | 164k | } |
1485 | 2.02M | if (dconf_entconv) { |
1486 | | |
1487 | 2.02M | if (value < 0x80) |
1488 | 1.82M | html_output_c(file_buff_o2, tolower(value)); |
1489 | 197k | else { |
1490 | 197k | unsigned char buff[10]; |
1491 | 197k | unsigned char *out = u16_normalize_tobuffer(value, buff, 10); |
1492 | 197k | if (out && out > buff) { |
1493 | 197k | html_output_str(file_buff_o2, buff, out - buff - 1); |
1494 | 197k | } |
1495 | 197k | } |
1496 | 2.02M | } else |
1497 | 0 | html_output_c(file_buff_o2, tolower(value & 0xff)); |
1498 | 2.02M | state = next_state; |
1499 | 2.02M | next_state = HTML_BAD_STATE; |
1500 | 2.02M | ptr++; |
1501 | 7.44M | } else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) { |
1502 | 6.52M | int64_t increment = 0; |
1503 | | |
1504 | 6.52M | if (hex && value < INT64_MAX / 16) { |
1505 | 269k | value *= 16; |
1506 | 6.25M | } else if (value < INT64_MAX / 10) { |
1507 | 6.23M | value *= 10; |
1508 | 6.23M | } else { |
1509 | 27.6k | html_output_c(file_buff_o2, value); |
1510 | 27.6k | state = next_state; |
1511 | 27.6k | next_state = HTML_BAD_STATE; |
1512 | 27.6k | ptr++; |
1513 | 27.6k | break; |
1514 | 27.6k | } |
1515 | 6.50M | if (isdigit(*ptr)) { |
1516 | 6.39M | increment = *ptr - '0'; |
1517 | 6.39M | } else { |
1518 | 104k | increment = tolower(*ptr) - 'a' + 10; |
1519 | 104k | } |
1520 | 6.50M | if (value > INT64_MAX - increment) { |
1521 | | /* Addition would result in integer overflow. */ |
1522 | 0 | html_output_c(file_buff_o2, value); |
1523 | 0 | state = next_state; |
1524 | 0 | next_state = HTML_BAD_STATE; |
1525 | 0 | ptr++; |
1526 | 0 | break; |
1527 | 0 | } |
1528 | 6.50M | value += increment; |
1529 | 6.50M | ptr++; |
1530 | 6.50M | } else { |
1531 | 912k | html_output_c(file_buff_o2, value); |
1532 | 912k | state = next_state; |
1533 | 912k | next_state = HTML_BAD_STATE; |
1534 | 912k | } |
1535 | 9.65M | break; |
1536 | 9.65M | case HTML_LOOKFOR_SCRENC: |
1537 | 278k | look_for_screnc = true; |
1538 | 278k | ptr_screnc = (unsigned char *)strstr((char *)ptr, "#@~^"); |
1539 | 278k | if (ptr_screnc) { |
1540 | 263k | ptr_screnc[0] = '/'; |
1541 | 263k | ptr_screnc[1] = '/'; |
1542 | 263k | ptr_screnc += 4; |
1543 | 263k | } |
1544 | 278k | state = next_state; |
1545 | 278k | next_state = saved_next_state; |
1546 | 278k | break; |
1547 | 7.07M | case HTML_JSDECODE: |
1548 | | /* Check for start marker */ |
1549 | 7.07M | if (strncmp((const char *)ptr, "#@~^", 4) == 0) { |
1550 | 4.94k | ptr[0] = '/'; |
1551 | 4.94k | ptr[1] = '/'; |
1552 | 4.94k | ptr += 4; |
1553 | 4.94k | state = HTML_JSDECODE_LENGTH; |
1554 | 4.94k | next_state = HTML_BAD_STATE; |
1555 | 7.06M | } else { |
1556 | 7.06M | html_output_c(file_buff_o2, tolower(*ptr)); |
1557 | 7.06M | ptr++; |
1558 | 7.06M | } |
1559 | 7.07M | break; |
1560 | 248k | case HTML_JSDECODE_LENGTH: |
1561 | 248k | if (strlen((const char *)ptr) < 8) { |
1562 | 837 | state = HTML_NORM; |
1563 | 837 | next_state = HTML_BAD_STATE; |
1564 | 837 | break; |
1565 | 837 | } |
1566 | 247k | memset(&screnc_state, 0, sizeof(screnc_state)); |
1567 | 247k | screnc_state.length = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2; |
1568 | 247k | screnc_state.length += base64_chars[ptr[1]] >> 4; |
1569 | 247k | screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12; |
1570 | 247k | screnc_state.length += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8; |
1571 | 247k | screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22; |
1572 | 247k | screnc_state.length += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16; |
1573 | 247k | screnc_state.length += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24; |
1574 | 247k | screnc_state.length += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24; |
1575 | 247k | state = HTML_JSDECODE_DECRYPT; |
1576 | 247k | in_screnc = true; |
1577 | 247k | next_state = HTML_BAD_STATE; |
1578 | | /* for JS normalizer */ |
1579 | 247k | ptr[7] = '\n'; |
1580 | 247k | ptr += 8; |
1581 | 247k | break; |
1582 | 268k | case HTML_JSDECODE_DECRYPT: |
1583 | 268k | screnc_decode(ptr, &screnc_state); |
1584 | 268k | if (!screnc_state.length) { |
1585 | 210k | state = HTML_NORM; |
1586 | 210k | next_state = HTML_BAD_STATE; |
1587 | 210k | in_screnc = false; |
1588 | 210k | break; |
1589 | 210k | } else { |
1590 | 57.9k | state = HTML_NORM; |
1591 | 57.9k | next_state = HTML_BAD_STATE; |
1592 | 57.9k | } |
1593 | 57.9k | break; |
1594 | 5.79M | case HTML_RFC2397_TYPE: |
1595 | 5.79M | if (*ptr == '\'') { |
1596 | 121k | if (!escape && (quoted == SINGLE_QUOTED)) { |
1597 | | /* Early end of data detected. Error */ |
1598 | 3.91k | ptr++; |
1599 | 3.91k | state = HTML_SKIP_WS; |
1600 | 3.91k | tag_arg_length = 0; |
1601 | 3.91k | next_state = HTML_TAG_ARG; |
1602 | 117k | } else { |
1603 | 117k | if (tag_val_length < HTML_STR_LENGTH) { |
1604 | 106k | tag_val[tag_val_length++] = '"'; |
1605 | 106k | } |
1606 | 117k | ptr++; |
1607 | 117k | } |
1608 | 5.67M | } else if (*ptr == '"') { |
1609 | 1.22M | if (!escape && (quoted == DOUBLE_QUOTED)) { |
1610 | | /* Early end of data detected. Error */ |
1611 | 4.84k | ptr++; |
1612 | 4.84k | state = HTML_SKIP_WS; |
1613 | 4.84k | tag_arg_length = 0; |
1614 | 4.84k | next_state = HTML_TAG_ARG; |
1615 | 1.22M | } else { |
1616 | 1.22M | if (tag_val_length < HTML_STR_LENGTH) { |
1617 | 447k | tag_val[tag_val_length++] = '"'; |
1618 | 447k | } |
1619 | 1.22M | ptr++; |
1620 | 1.22M | } |
1621 | 4.44M | } else if (isspace(*ptr) || (*ptr == '>')) { |
1622 | 45.4k | if (quoted == NOT_QUOTED) { |
1623 | | /* Early end of data detected. Error */ |
1624 | 22.0k | state = HTML_SKIP_WS; |
1625 | 22.0k | tag_arg_length = 0; |
1626 | 22.0k | next_state = HTML_TAG_ARG; |
1627 | 23.3k | } else { |
1628 | 23.3k | if (tag_val_length < HTML_STR_LENGTH) { |
1629 | 22.4k | if (isspace(*ptr)) { |
1630 | 5.96k | tag_val[tag_val_length++] = ' '; |
1631 | 16.5k | } else { |
1632 | 16.5k | tag_val[tag_val_length++] = '>'; |
1633 | 16.5k | } |
1634 | 22.4k | } |
1635 | 23.3k | state = HTML_SKIP_WS; |
1636 | 23.3k | escape = false; |
1637 | 23.3k | quoted = NOT_QUOTED; |
1638 | 23.3k | next_state = HTML_RFC2397_TYPE; |
1639 | 23.3k | ptr++; |
1640 | 23.3k | } |
1641 | 4.39M | } else if (*ptr == ',') { |
1642 | | /* Beginning of data */ |
1643 | 107k | tag_val[tag_val_length] = '\0'; |
1644 | 107k | state = HTML_RFC2397_INIT; |
1645 | 107k | escape = false; |
1646 | 107k | next_state = HTML_BAD_STATE; |
1647 | 107k | ptr++; |
1648 | | |
1649 | 4.28M | } else { |
1650 | 4.28M | if (tag_val_length < HTML_STR_LENGTH) { |
1651 | 3.59M | tag_val[tag_val_length++] = tolower(*ptr); |
1652 | 3.59M | } |
1653 | 4.28M | ptr++; |
1654 | 4.28M | } |
1655 | 5.79M | if (*ptr == '\\') { |
1656 | 11.1k | escape = true; |
1657 | 5.78M | } else { |
1658 | 5.78M | escape = false; |
1659 | 5.78M | } |
1660 | 5.79M | break; |
1661 | 107k | case HTML_RFC2397_INIT: |
1662 | 107k | if (dirname) { |
1663 | 107k | STATBUF statbuf; |
1664 | | |
1665 | 107k | if (NULL != file_tmp_o1) { |
1666 | 5.33k | if (file_tmp_o1->fd != -1) { |
1667 | 5.33k | html_output_flush(file_tmp_o1); |
1668 | 5.33k | close(file_tmp_o1->fd); |
1669 | 5.33k | file_tmp_o1->fd = -1; |
1670 | 5.33k | } |
1671 | 5.33k | free(file_tmp_o1); |
1672 | 5.33k | } |
1673 | | |
1674 | 107k | file_tmp_o1 = (file_buff_t *)malloc(sizeof(file_buff_t)); |
1675 | 107k | if (!file_tmp_o1) { |
1676 | 0 | cli_errmsg("cli_html_normalise: Unable to allocate memory for file_tmp_o1\n"); |
1677 | 0 | goto done; |
1678 | 0 | } |
1679 | 107k | file_tmp_o1->fd = -1; |
1680 | | |
1681 | | /* Create rfc2397 directory if it doesn't already exist */ |
1682 | 107k | snprintf(filename, 1024, "%s" PATHSEP "rfc2397", dirname); |
1683 | 107k | if (LSTAT(filename, &statbuf) == -1) { |
1684 | 17.0k | if (mkdir(filename, 0700) && errno != EEXIST) { |
1685 | 0 | cli_errmsg("Failed to create directory: %s\n", dirname); |
1686 | 0 | goto done; |
1687 | 0 | } |
1688 | 17.0k | } |
1689 | | |
1690 | 107k | tmp_file = cli_gentemp(filename); |
1691 | 107k | if (!tmp_file) { |
1692 | 0 | goto done; |
1693 | 0 | } |
1694 | 107k | cli_dbgmsg("RFC2397 data file: %s\n", tmp_file); |
1695 | 107k | file_tmp_o1->fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR); |
1696 | 107k | free(tmp_file); |
1697 | 107k | if (file_tmp_o1->fd < 0) { |
1698 | 0 | cli_dbgmsg("open failed: %s\n", filename); |
1699 | 0 | goto done; |
1700 | 0 | } |
1701 | 107k | file_tmp_o1->length = 0; |
1702 | | |
1703 | 107k | html_output_str(file_tmp_o1, (const unsigned char *)"From html-normalise\n", 20); |
1704 | 107k | html_output_str(file_tmp_o1, (const unsigned char *)"Content-type: ", 14); |
1705 | 107k | if ((tag_val_length == 0) && (*tag_val == ';')) { |
1706 | 0 | html_output_str(file_tmp_o1, (const unsigned char *)"text/plain\n", 11); |
1707 | 0 | } |
1708 | 107k | html_output_str(file_tmp_o1, (const unsigned char *)tag_val, tag_val_length); |
1709 | 107k | html_output_c(file_tmp_o1, '\n'); |
1710 | 107k | if (strstr(tag_val, ";base64") != NULL) { |
1711 | 3.49k | html_output_str(file_tmp_o1, (const unsigned char *)"Content-transfer-encoding: base64\n", 34); |
1712 | 3.49k | } |
1713 | 107k | html_output_c(file_tmp_o1, '\n'); |
1714 | 107k | } else { |
1715 | 0 | file_tmp_o1 = NULL; |
1716 | 0 | } |
1717 | 107k | state = HTML_RFC2397_DATA; |
1718 | 107k | binary = true; |
1719 | 107k | break; |
1720 | 57.3M | case HTML_RFC2397_DATA: |
1721 | 57.3M | if (*ptr == '&') { |
1722 | 71.8k | state = HTML_CHAR_REF; |
1723 | 71.8k | next_state = HTML_RFC2397_DATA; |
1724 | 71.8k | ptr++; |
1725 | 57.3M | } else if (*ptr == '%') { |
1726 | 414k | length = 0; |
1727 | 414k | value = 0; |
1728 | 414k | state = HTML_ESCAPE_CHAR; |
1729 | 414k | next_state = HTML_RFC2397_ESC; |
1730 | 414k | ptr++; |
1731 | 56.8M | } else if (*ptr == '\'') { |
1732 | 127k | if (!escape && (quoted == SINGLE_QUOTED)) { |
1733 | 68.6k | state = HTML_RFC2397_FINISH; |
1734 | 68.6k | ptr++; |
1735 | 68.6k | } else { |
1736 | 59.0k | html_output_c(file_tmp_o1, *ptr); |
1737 | 59.0k | ptr++; |
1738 | 59.0k | } |
1739 | 56.7M | } else if (*ptr == '\"') { |
1740 | 7.17M | if (!escape && (quoted == DOUBLE_QUOTED)) { |
1741 | 7.75k | state = HTML_RFC2397_FINISH; |
1742 | 7.75k | ptr++; |
1743 | 7.16M | } else { |
1744 | 7.16M | html_output_c(file_tmp_o1, *ptr); |
1745 | 7.16M | ptr++; |
1746 | 7.16M | } |
1747 | 49.5M | } else if (isspace(*ptr) || (*ptr == '>')) { |
1748 | 6.62M | if (quoted == NOT_QUOTED) { |
1749 | 20.3k | state = HTML_RFC2397_FINISH; |
1750 | 20.3k | ptr++; |
1751 | 6.60M | } else { |
1752 | 6.60M | html_output_c(file_tmp_o1, *ptr); |
1753 | 6.60M | ptr++; |
1754 | 6.60M | } |
1755 | 42.9M | } else { |
1756 | 42.9M | html_output_c(file_tmp_o1, *ptr); |
1757 | 42.9M | ptr++; |
1758 | 42.9M | } |
1759 | 57.3M | if (*ptr == '\\') { |
1760 | 62.3k | escape = true; |
1761 | 57.3M | } else { |
1762 | 57.3M | escape = false; |
1763 | 57.3M | } |
1764 | 57.3M | break; |
1765 | 96.8k | case HTML_RFC2397_FINISH: |
1766 | 96.8k | if (file_tmp_o1) { |
1767 | 96.8k | if (file_tmp_o1->fd != -1) { |
1768 | 96.8k | html_output_flush(file_tmp_o1); |
1769 | 96.8k | close(file_tmp_o1->fd); |
1770 | 96.8k | file_tmp_o1->fd = -1; |
1771 | 96.8k | } |
1772 | 96.8k | free(file_tmp_o1); |
1773 | 96.8k | file_tmp_o1 = NULL; |
1774 | 96.8k | } |
1775 | 96.8k | state = HTML_SKIP_WS; |
1776 | 96.8k | escape = false; |
1777 | 96.8k | quoted = NOT_QUOTED; |
1778 | 96.8k | next_state = HTML_TAG_ARG; |
1779 | 96.8k | binary = false; |
1780 | 96.8k | break; |
1781 | 414k | case HTML_RFC2397_ESC: |
1782 | 414k | if (length == 2) { |
1783 | 52.5k | html_output_c(file_tmp_o1, value); |
1784 | 361k | } else if (length == 1) { |
1785 | 361k | html_output_c(file_tmp_o1, '%'); |
1786 | 361k | html_output_c(file_tmp_o1, value + '0'); |
1787 | 361k | } else { |
1788 | 0 | html_output_c(file_tmp_o1, '%'); |
1789 | 0 | } |
1790 | 414k | state = HTML_RFC2397_DATA; |
1791 | 414k | break; |
1792 | 466k | case HTML_ESCAPE_CHAR: |
1793 | 466k | if (value < INT64_MAX / 16) { |
1794 | 466k | value *= 16; |
1795 | 466k | } else { |
1796 | 0 | state = next_state; |
1797 | 0 | next_state = HTML_BAD_STATE; |
1798 | 0 | ptr++; |
1799 | 0 | break; |
1800 | 0 | } |
1801 | 466k | length++; |
1802 | 466k | if (isxdigit(*ptr)) { |
1803 | 76.6k | if (isdigit(*ptr)) { |
1804 | 63.2k | value += (*ptr - '0'); |
1805 | 63.2k | } else { |
1806 | 13.4k | value += (tolower(*ptr) - 'a' + 10); |
1807 | 13.4k | } |
1808 | 389k | } else { |
1809 | 389k | state = next_state; |
1810 | 389k | } |
1811 | 466k | if (length == 2) { |
1812 | 52.5k | state = next_state; |
1813 | 52.5k | } |
1814 | 466k | ptr++; |
1815 | 466k | break; |
1816 | 3.18G | } |
1817 | 3.18G | } |
1818 | 699k | if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin) |
1819 | | /* end of line, append contents now, resume on next line */ |
1820 | 0 | html_tag_contents_append(&contents, href_contents_begin, ptr); |
1821 | 699k | ptrend = NULL; |
1822 | | |
1823 | 699k | if (js_state) { |
1824 | 455k | js_process(js_state, js_begin, js_end, line, ptr, in_tag, dirname); |
1825 | 455k | js_begin = js_end = NULL; |
1826 | 455k | if (in_tag == TAG_DONT_EXTRACT) { |
1827 | 0 | js_state = NULL; |
1828 | 0 | } |
1829 | 455k | } |
1830 | | |
1831 | 699k | if (in_tag == TAG_STYLE) { |
1832 | 17.2k | if (ptr < style_begin) { |
1833 | 210 | cli_dbgmsg("cli_html_normalise: style chunk size underflow\n"); |
1834 | 210 | goto done; |
1835 | 210 | } |
1836 | | |
1837 | 17.0k | size_t chunk_size = ptr - style_begin; |
1838 | | |
1839 | 17.0k | if (style_buff == NULL) { |
1840 | 2.14k | CLI_MAX_MALLOC_OR_GOTO_DONE(style_buff, chunk_size + 1); |
1841 | 14.9k | } else { |
1842 | 14.9k | CLI_MAX_REALLOC_OR_GOTO_DONE(style_buff, style_buff_size + chunk_size + 1); |
1843 | 14.9k | } |
1844 | | |
1845 | 17.0k | memcpy(style_buff + style_buff_size, style_begin, chunk_size); |
1846 | | |
1847 | 17.0k | style_buff_size += chunk_size; |
1848 | 17.0k | style_buff[style_buff_size] = '\0'; |
1849 | 17.0k | } |
1850 | | |
1851 | 699k | if (look_for_screnc && ptr_screnc) { |
1852 | | /* start found, and stuff before it already processed */ |
1853 | 243k | ptr = ptr_screnc; |
1854 | 243k | ptr_screnc = NULL; |
1855 | 243k | state = HTML_JSDECODE_LENGTH; |
1856 | 243k | next_state = HTML_BAD_STATE; |
1857 | 243k | continue; |
1858 | 243k | } |
1859 | 455k | free(line); |
1860 | 455k | ptr = line = cli_readchunk(stream_in, m_area, 8192); |
1861 | | |
1862 | 455k | if (in_tag == TAG_STYLE) { |
1863 | | // reset style_begin to start of the next line |
1864 | 14.8k | style_begin = line; |
1865 | 14.8k | } |
1866 | | |
1867 | 455k | if (in_screnc) { |
1868 | 56.1k | state = HTML_JSDECODE_DECRYPT; |
1869 | 56.1k | next_state = HTML_BAD_STATE; |
1870 | 399k | } else if (look_for_screnc && !ptr_screnc && |
1871 | 399k | state != HTML_LOOKFOR_SCRENC) { |
1872 | 127k | saved_next_state = next_state; |
1873 | 127k | next_state = state; |
1874 | 127k | state = HTML_LOOKFOR_SCRENC; |
1875 | 127k | } |
1876 | | |
1877 | 455k | if (next_state == state) { |
1878 | | /* safeguard against infloop */ |
1879 | 27.6k | cli_dbgmsg("htmlnorm.c: next_state == state, changing next_state\n"); |
1880 | 27.6k | next_state = HTML_BAD_STATE; |
1881 | 27.6k | } |
1882 | 455k | } |
1883 | | |
1884 | 259k | if (style_buff != NULL) { |
1885 | | // Found contents of <style> ... </style> block. |
1886 | | // Search it for images embedded in the CSS. |
1887 | 15.9k | cl_error_t ret = html_style_block_handler(ctx, (const char *)style_buff); |
1888 | 15.9k | if (CL_SUCCESS != ret) { |
1889 | 0 | cli_dbgmsg("Scan of image extracted from html <style> block returned: %s\n", cl_strerror(ret)); |
1890 | 0 | goto done; |
1891 | 0 | } |
1892 | | |
1893 | 15.9k | free(style_buff); |
1894 | 15.9k | style_buff = NULL; |
1895 | 15.9k | } |
1896 | | |
1897 | 259k | if (dconf_entconv) { |
1898 | | /* handle "unfinished" entities */ |
1899 | 259k | size_t i; |
1900 | 259k | const char *normalized; |
1901 | 259k | entity_val[entity_val_length] = '\0'; |
1902 | 259k | normalized = entity_norm(&conv, entity_val); |
1903 | 259k | if (normalized) { |
1904 | 8.40k | for (i = 0; i < strlen(normalized); i++) |
1905 | 7.46k | html_output_c(file_buff_o2, normalized[i] & 0xff); |
1906 | 258k | } else { |
1907 | 258k | if (entity_val_length) { |
1908 | 1.62k | html_output_c(file_buff_o2, '&'); |
1909 | 179k | for (i = 0; i < entity_val_length; i++) |
1910 | 177k | html_output_c(file_buff_o2, tolower(entity_val[i])); |
1911 | 1.62k | } |
1912 | 258k | } |
1913 | 259k | } |
1914 | | |
1915 | 259k | retval = true; |
1916 | | |
1917 | 259k | done: |
1918 | 259k | if (line) /* only needed for done case */ |
1919 | 626 | free(line); |
1920 | 259k | if (in_form_action) |
1921 | 0 | free(in_form_action); |
1922 | 259k | if (in_ahref) /* tag not closed, force closing */ |
1923 | 0 | html_tag_contents_done(hrefs, in_ahref, &contents); |
1924 | | |
1925 | 259k | if (js_state) { |
1926 | | /* output script so far */ |
1927 | 180k | cli_js_parse_done(js_state); |
1928 | 180k | cli_js_output(js_state, dirname); |
1929 | 180k | cli_js_destroy(js_state); |
1930 | 180k | js_state = NULL; |
1931 | 180k | } |
1932 | 259k | html_tag_arg_free(&tag_args); |
1933 | 259k | if (!m_area) { |
1934 | 0 | fclose(stream_in); |
1935 | 0 | } |
1936 | 259k | if (file_buff_o2) { |
1937 | 259k | html_output_flush(file_buff_o2); |
1938 | 259k | if (file_buff_o2->fd != -1) |
1939 | 259k | close(file_buff_o2->fd); |
1940 | 259k | free(file_buff_o2); |
1941 | 259k | } |
1942 | 259k | if (file_buff_text) { |
1943 | 259k | html_output_flush(file_buff_text); |
1944 | 259k | if (file_buff_text->fd != -1) |
1945 | 259k | close(file_buff_text->fd); |
1946 | 259k | free(file_buff_text); |
1947 | 259k | file_buff_text = NULL; |
1948 | 259k | } |
1949 | 259k | if (file_tmp_o1) { |
1950 | 5.33k | if (file_tmp_o1->fd != -1) { |
1951 | 5.33k | html_output_flush(file_tmp_o1); |
1952 | 5.33k | close(file_tmp_o1->fd); |
1953 | 5.33k | } |
1954 | 5.33k | free(file_tmp_o1); |
1955 | 5.33k | } |
1956 | 259k | if (style_buff != NULL) { |
1957 | 626 | free(style_buff); |
1958 | 626 | } |
1959 | 259k | return retval; |
1960 | 259k | } |
1961 | | |
/*
 * Normalise HTML that is held in a caller-supplied memory buffer.
 *
 * Describes in_buff/in_size with an m_area_t (offset 0, no fmap attached)
 * and forwards everything to cli_html_normalise() with a file descriptor
 * of -1, so the input is taken from the memory area.
 *
 * ctx, dirname, hrefs and dconf are passed through unchanged.
 *
 * Returns the value returned by cli_html_normalise().
 */
1962 | | bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
1963 | 0 | {
1964 | 0 | m_area_t m_area;
1965 |

1966 | 0 | m_area.buffer = in_buff;
1967 | 0 | m_area.length = in_size;
1968 | 0 | m_area.offset = 0;
        /* No fmap: the data is read straight from m_area.buffer. */
1969 | 0 | m_area.map = NULL;
1970 |

1971 | 0 | return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
1972 | 0 | }
1973 | | |
/*
 * Normalise HTML that is backed by an fmap.
 *
 * Describes the whole map (offset 0, length map->len) with an m_area_t and
 * forwards everything to cli_html_normalise() with a file descriptor of -1.
 *
 * ctx, dirname, hrefs and dconf are passed through unchanged.
 *
 * Returns the value returned by cli_html_normalise().
 *
 * NOTE(review): m_area.buffer is never assigned in this function, unlike
 * html_normalise_mem() (explicit assignment) and html_screnc_decode()
 * (memset of the whole struct). Presumably the buffer field is not read
 * when m_area.map is set -- confirm against cli_readchunk().
 */
1974 | | bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
1975 | 259k | {
1976 | 259k | bool retval = false;
1977 | 259k | m_area_t m_area;
1978 | | 
1979 | 259k | m_area.length = map->len;
1980 | 259k | m_area.offset = 0;
1981 | 259k | m_area.map = map;
1982 | 259k | retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
1983 | 259k | return retval;
1984 | 259k | }
1985 | | |
/*
 * Decode the first Script Encoder block (introduced by the "#@~^" marker)
 * found in `map` and write the result to the file
 * dirname PATHSEP "screnc.html", wrapped in "<script>" ... "</script>".
 *
 * Steps, in the order performed below:
 *   1. Create/truncate the output file (owner read/write only).
 *   2. Fetch chunks with cli_readchunk(NULL, &m_area, 8192) until one
 *      contains "#@~^".
 *   3. Consume the 8 characters that follow the marker, continuing into the
 *      next chunk when the current chunk's terminating NUL is reached. The
 *      first 6 are kept in tmpstr and converted to screnc_state.length via
 *      the base64_chars[] table.
 *   4. Call screnc_decode() on the remaining data and write it out, fetching
 *      further chunks while screnc_state.length is non-zero and data remains.
 *
 * Returns false when the output file cannot be opened, when no marker is
 * found, or when the input ends before the 8 header characters are read.
 * Returns true otherwise -- including when the input ran out while
 * screnc_state.length was still non-zero, which is only reported through
 * cli_dbgmsg().
 *
 * The output file is created before the marker search, so it is left on
 * disk on the false paths that go through `done`.
 * The results of the cli_writen() calls are not checked.
 */
1986 | | bool html_screnc_decode(fmap_t *map, const char *dirname)
1987 | 360k | {
1988 | 360k | int count;
1989 | 360k | bool retval = false;
1990 | 360k | unsigned char *line = NULL, tmpstr[6];
1991 | 360k | unsigned char *ptr, filename[1024];
1992 | 360k | int ofd;
1993 | 360k | struct screnc_state screnc_state;
1994 | 360k | m_area_t m_area;
1995 | | 
1996 | 360k | memset(&m_area, 0, sizeof(m_area));
1997 | 360k | m_area.length = map->len;
1998 | 360k | m_area.offset = 0;
1999 | 360k | m_area.map = map;
2000 | | 
2001 | 360k | snprintf((char *)filename, 1024, "%s" PATHSEP "screnc.html", dirname);
2002 | 360k | ofd = open((const char *)filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);
2003 | | 
2004 | 360k | if (ofd < 0) {
2005 | 0 | cli_dbgmsg("open failed: %s\n", filename);
2006 | 0 | return false;
2007 | 0 | }
2008 | | 
        /* Scan chunk by chunk for the start marker; on a hit, `line` stays
         * allocated and `ptr` points at the marker inside it. */
2009 | 360k | while ((line = cli_readchunk(NULL, &m_area, 8192)) != NULL) {
2010 | 360k | ptr = (unsigned char *)strstr((char *)line, "#@~^");
2011 | 360k | if (ptr) {
2012 | 360k | break;
2013 | 360k | }
2014 | 0 | free(line);
2015 | 0 | line = NULL;
2016 | 0 | }
        /* Input exhausted without finding a marker. */
2017 | 360k | if (!line) {
2018 | 0 | goto done;
2019 | 0 | }
2020 | | 
2021 | | /* Calculate the length of the encoded string */
        /* Step over the 4-character "#@~^" marker. */
2022 | 360k | ptr += 4;
2023 | 360k | count = 0;
2024 | 2.71M | do {
        /* Current chunk ended: continue the header in the next chunk. */
2025 | 2.71M | if (!*ptr) {
2026 | 66.8k | free(line);
2027 | 66.8k | ptr = line = cli_readchunk(NULL, &m_area, 8192);
2028 | 66.8k | if (!line) {
2029 | 55.6k | goto done;
2030 | 55.6k | }
2031 | 66.8k | }
        /* Only the first 6 of the 8 header characters are stored; the last
         * two are consumed but not used. */
2032 | 2.66M | if (count < 6)
2033 | 2.03M | tmpstr[count] = *ptr;
2034 | 2.66M | count++;
2035 | 2.66M | ptr++;
2036 | 2.66M | } while (count < 8);
2037 | | 
        /* NOTE(review): base64_chars[] appears to yield a negative value for
         * characters outside the alphabet; only some of the terms below
         * clamp that to 0 -- confirm the unclamped terms are intentional. */
2038 | 304k | memset(&screnc_state, 0, sizeof(screnc_state));
2039 | 304k | screnc_state.length = base64_chars[tmpstr[0]] < 0 ? 0 : base64_chars[tmpstr[0]] << 2;
2040 | 304k | screnc_state.length += base64_chars[tmpstr[1]] >> 4;
2041 | 304k | screnc_state.length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
2042 | 304k | screnc_state.length += ((base64_chars[tmpstr[2]] >> 2) < 0 ? 0 : (base64_chars[tmpstr[2]] >> 2)) << 8;
2043 | 304k | screnc_state.length += (base64_chars[tmpstr[2]] & 0x03) << 22;
2044 | 304k | screnc_state.length += base64_chars[tmpstr[3]] < 0 ? 0 : base64_chars[tmpstr[3]] << 16;
2045 | 304k | screnc_state.length += (base64_chars[tmpstr[4]] < 0 ? 0 : base64_chars[tmpstr[4]] << 2) << 24;
2046 | 304k | screnc_state.length += ((base64_chars[tmpstr[5]] >> 4) < 0 ? 0 : (base64_chars[tmpstr[5]] >> 4)) << 24;
2047 | 304k | cli_writen(ofd, "<script>", strlen("<script>"));
        /* presumably screnc_decode() decodes the buffer in place and lowers
         * screnc_state.length -- the same ptr is written out right after the
         * call; verify against its definition. */
2048 | 725k | while (screnc_state.length && line) {
2049 | 420k | screnc_decode(ptr, &screnc_state);
2050 | 420k | cli_writen(ofd, ptr, strlen((const char *)ptr));
2051 | 420k | free(line);
2052 | 420k | line = NULL;
        /* More encoded data expected: fetch the next chunk. A NULL result
         * ends the loop through the `line` test above. */
2053 | 420k | if (screnc_state.length) {
2054 | 392k | ptr = line = cli_readchunk(NULL, &m_area, 8192);
2055 | 392k | }
2056 | 420k | }
2057 | 304k | cli_writen(ofd, "</script>", strlen("</script>"));
2058 | 304k | if (screnc_state.length)
2059 | 265k | cli_dbgmsg("html_screnc_decode: missing %u bytes\n", screnc_state.length);
2060 | 304k | retval = true;
2061 | | 
2062 | 360k | done:
2063 | 360k | close(ofd);
2064 | 360k | if (line) {
2065 | 10.9k | free(line);
2066 | 10.9k | }
2067 | 360k | return retval;
2068 | 304k | }