/src/clamav/libclamav/htmlnorm.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  *  Copyright (C) 2013-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.  | 
3  |  |  *  Copyright (C) 2007-2013 Sourcefire, Inc.  | 
4  |  |  *  | 
5  |  |  *  Authors: Trog  | 
6  |  |  *  | 
7  |  |  *  Summary: Normalise HTML text. Decode MS Script Encoder protection.  | 
8  |  |  *           The ScrEnc decoder was initially based upon an analysis by Andreas Marx.  | 
9  |  |  *  | 
10  |  |  *  This program is free software; you can redistribute it and/or modify  | 
11  |  |  *  it under the terms of the GNU General Public License version 2 as  | 
12  |  |  *  published by the Free Software Foundation.  | 
13  |  |  *  | 
14  |  |  *  This program is distributed in the hope that it will be useful,  | 
15  |  |  *  but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
16  |  |  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  | 
17  |  |  *  GNU General Public License for more details.  | 
18  |  |  *  | 
19  |  |  *  You should have received a copy of the GNU General Public License  | 
20  |  |  *  along with this program; if not, write to the Free Software  | 
21  |  |  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,  | 
22  |  |  *  MA 02110-1301, USA.  | 
23  |  |  */  | 
24  |  |  | 
25  |  | #if HAVE_CONFIG_H  | 
26  |  | #include "clamav-config.h"  | 
27  |  | #endif  | 
28  |  |  | 
29  |  | #include <stdio.h>  | 
30  |  | #ifdef HAVE_UNISTD_H  | 
31  |  | #include <unistd.h>  | 
32  |  | #endif  | 
33  |  | #include <sys/types.h>  | 
34  |  | #include <sys/stat.h>  | 
35  |  | #include <fcntl.h>  | 
36  |  | #ifdef HAVE_STRINGS_H  | 
37  |  | #include <strings.h>  | 
38  |  | #endif  | 
39  |  | #include <string.h>  | 
40  |  | #include <errno.h>  | 
41  |  | #include <stdio.h>  | 
42  |  | #include <ctype.h>  | 
43  |  |  | 
44  |  | #include "clamav.h"  | 
45  |  | #include "fmap.h"  | 
46  |  | #include "others.h"  | 
47  |  | #include "htmlnorm.h"  | 
48  |  |  | 
49  |  | #include "entconv.h"  | 
50  |  | #include "jsparse/js-norm.h"  | 
51  |  |  | 
52  |  | #include "clamav_rust.h"  | 
53  |  | #include "scanners.h"  | 
54  |  |  | 
/* Capacity, excluding the terminating NUL, of the tag / tag-argument /
 * tag-value scratch buffers declared in cli_html_normalise(). */
#define HTML_STR_LENGTH 1024
/* Maximum number of content bytes collected in struct tag_contents; the
 * backing array reserves one additional byte. */
#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
57  |  |  | 
/*
 * States of the HTML normaliser. cli_html_normalise() keeps a current state,
 * a next state and a saved next state of this type; it starts in HTML_NORM
 * and uses HTML_BAD_STATE as the "no state set" value for the latter two.
 *
 * NOTE(review): the meaning of the individual states is defined by the state
 * machine body, which is not documented here - confirm against
 * cli_html_normalise() before relying on any particular state.
 */
typedef enum {
    HTML_BAD_STATE,
    HTML_NORM,
    HTML_8BIT,
    HTML_COMMENT,
    HTML_CHAR_REF,
    HTML_ENTITY_REF_DECODE,
    HTML_SKIP_WS,
    HTML_TRIM_WS,
    HTML_TAG,
    HTML_TAG_ARG,
    HTML_TAG_ARG_VAL,
    HTML_TAG_ARG_EQUAL,
    HTML_PROCESS_TAG,
    HTML_CHAR_REF_DECODE,
    HTML_LOOKFOR_SCRENC,
    HTML_JSDECODE,
    HTML_JSDECODE_LENGTH,
    HTML_JSDECODE_DECRYPT,
    HTML_SPECIAL_CHAR,
    HTML_RFC2397_TYPE,
    HTML_RFC2397_INIT,
    HTML_RFC2397_DATA,
    HTML_RFC2397_FINISH,
    HTML_RFC2397_ESC,
    HTML_ESCAPE_CHAR
} html_state;
85  |  |  | 
/*
 * Kind of element the normaliser is currently inside of.
 * TAG_DONT_EXTRACT is the initial value in cli_html_normalise(); js_process()
 * finalises and outputs the collected script when it is called with it.
 */
typedef enum {
    TAG_DONT_EXTRACT,
    TAG_SCRIPT,
    TAG_STYLE,
} tag_type;
91  |  |  | 
/*
 * Quoting style tracked by cli_html_normalise() (initially NOT_QUOTED).
 * NOTE(review): presumably the quoting of the tag argument value being
 * parsed - confirm against the state machine.
 */
typedef enum {
    SINGLE_QUOTED,
    DOUBLE_QUOTED,
    NOT_QUOTED
} quoted_state;
97  |  |  | 
/* Size in bytes of the output buffer embedded in file_buff_t. */
#define HTML_FILE_BUFF_LEN 8192

/*
 * Buffered writer for one output file descriptor.
 * Bytes are accumulated in `buffer` by html_output_c()/html_output_str() and
 * written to `fd` with cli_writen() by html_output_flush().
 */
typedef struct file_buff_tag {
    int fd;                                   /* descriptor the buffer is flushed to */
    unsigned char buffer[HTML_FILE_BUFF_LEN]; /* pending, not yet written bytes */
    uint64_t length;                          /* number of valid bytes in buffer */
} file_buff_t;
105  |  |  | 
/*
 * Accumulator for the contents of a tag, filled by html_tag_contents_append()
 * and copied out (NUL-terminated) by html_tag_contents_done().
 */
struct tag_contents {
    size_t pos;                                          /* index of the next byte to write in contents */
    unsigned char contents[MAX_TAG_CONTENTS_LENGTH + 1]; /* collected bytes; extra byte is for the terminator */
};
110  |  |  | 
/*
 * Lookup table from a byte value to its base64 digit value:
 * 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, '/' -> 63.
 * Every other byte maps to -1. screnc_decode() uses it to decode the
 * checksum that follows the encoded data.
 */
// clang-format off
static const int64_t base64_chars[256] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
// clang-format on
131  |  |  | 
/*
 * For each of the 64 positions in the decoding cycle, the index (0, 1 or 2)
 * of the decrypt_tables row that screnc_decode() uses for the next byte.
 * The entries are written with a leading zero, i.e. as octal literals; since
 * only 0, 1 and 2 occur the values are the same as in decimal.
 */
int table_order[] = {
    00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
    00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
    00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
    00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02};
137  |  |  | 
/*
 * Three substitution tables, each mapping an encoded 7-bit byte to its
 * decoded value. screnc_decode() picks the row through table_order[] and
 * indexes it with the encoded byte. Entry 0x40 ('@') is 0xFF in every row;
 * screnc_decode() treats 0xFF as the marker of a two-byte escape sequence.
 */
int decrypt_tables[3][128] = {
    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
     0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
     0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
     0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
     0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
     0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},

    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
     0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
     0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
     0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
     0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
     0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},

    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
     0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
     0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
     0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
     0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
     0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}};
165  |  |  | 
/**
 * Find where to cut a chunk so that it ends on whitespace.
 *
 * Walks backwards from the end of @chunk looking for a whitespace byte.
 *
 * @param chunk  buffer to inspect (at least @len bytes)
 * @param len    number of bytes in @chunk
 * @return the shortened length L (1 < L <= len) such that chunk[L - 1] is
 *         whitespace, or @len unchanged when no whitespace is found at an
 *         index above 0 (this includes len == 0 and len == 1).
 */
static inline unsigned int rewind_tospace(const unsigned char *chunk, unsigned int len)
{
    unsigned int pos = len;

    /* Test the bound before dereferencing, so a zero length never reads
     * chunk[-1]. */
    while ((pos > 1) && !isspace(chunk[pos - 1])) {
        pos--;
    }
    if (pos <= 1) {
        /* nothing to rewind to: keep the chunk as it is */
        return len;
    }
    return pos;
}
177  |  |  | 
178  |  | /* read at most @max_len of data from @m_area or @stream, skipping NULL chars.  | 
179  |  |  * This used to be called cli_readline, but we don't stop at end-of-line anymore */  | 
180  |  | static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)  | 
181  | 1.53M  | { | 
182  | 1.53M  |     unsigned char *chunk, *start, *ptr, *end;  | 
183  | 1.53M  |     unsigned int chunk_len, count;  | 
184  |  |  | 
185  | 1.53M  |     chunk = (unsigned char *)cli_max_malloc(max_len);  | 
186  | 1.53M  |     if (!chunk) { | 
187  | 0  |         cli_errmsg("readchunk: Unable to allocate memory for chunk\n"); | 
188  | 0  |         return NULL;  | 
189  | 0  |     }  | 
190  |  |  | 
191  |  |     /* Try to use the memory buffer first */  | 
192  | 1.53M  |     if (m_area) { | 
193  |  |         /* maximum we can copy into the buffer,  | 
194  |  |          * we could have less than max_len bytes available */  | 
195  | 1.53M  |         chunk_len = MIN(m_area->length - m_area->offset, max_len - 1);  | 
196  | 1.53M  |         if (!chunk_len) { | 
197  | 580k  |             free(chunk);  | 
198  | 580k  |             return NULL;  | 
199  | 580k  |         }  | 
200  | 954k  |         if (m_area->map)  | 
201  | 954k  |             ptr = (unsigned char *)fmap_need_off_once(m_area->map, m_area->offset, chunk_len);  | 
202  | 0  |         else  | 
203  | 0  |             ptr = m_area->buffer + m_area->offset;  | 
204  | 954k  |         start = ptr;  | 
205  | 954k  |         end   = ptr - m_area->offset + m_area->length;  | 
206  |  |  | 
207  | 954k  |         if ((start >= end) || !start) { | 
208  | 0  |             free(chunk);  | 
209  | 0  |             return NULL;  | 
210  | 0  |         }  | 
211  |  |  | 
212  |  |         /* look for NULL chars */  | 
213  | 954k  |         ptr = memchr(start, 0, chunk_len);  | 
214  | 954k  |         if (!ptr) { | 
215  |  |             /* no NULL chars found, copy all */  | 
216  | 295k  |             memcpy(chunk, start, chunk_len);  | 
217  | 295k  |             chunk[chunk_len] = '\0';  | 
218  | 295k  |             m_area->offset += chunk_len;  | 
219  |  |             /* point ptr to end of chunk,  | 
220  |  |              * so we can check and rewind to a space below */  | 
221  | 295k  |             ptr = start + chunk_len;  | 
222  | 658k  |         } else { | 
223  |  |             /* copy portion that doesn't contain NULL chars */  | 
224  | 658k  |             chunk_len = ptr - start;  | 
225  | 658k  |             if (chunk_len < max_len) { | 
226  | 658k  |                 memcpy(chunk, start, chunk_len);  | 
227  | 658k  |             } else { | 
228  | 0  |                 chunk_len = 0;  | 
229  | 0  |                 ptr       = start;  | 
230  | 0  |             }  | 
231  | 658k  |             if (m_area->map)  | 
232  | 658k  |                 ptr = (unsigned char *)fmap_need_ptr_once(m_area->map, ptr, end - ptr);  | 
233  | 658k  |             if (!ptr) { | 
234  | 0  |                 cli_warnmsg("fmap inconsistency\n"); | 
235  | 0  |                 ptr = end;  | 
236  | 0  |             }  | 
237  |  |             /* we have unknown number of NULL chars,  | 
238  |  |              * copy char-by-char and skip them */  | 
239  | 2.78G  |             while ((ptr < end) && (chunk_len < max_len - 1)) { | 
240  | 2.78G  |                 const unsigned char c = *ptr++;  | 
241  |  |                 /* we can't use chunk_len to determine how many bytes we read, since  | 
242  |  |                  * we skipped chars */  | 
243  | 2.78G  |                 if (c) { | 
244  | 2.29G  |                     chunk[chunk_len++] = c;  | 
245  | 2.29G  |                 }  | 
246  | 2.78G  |             }  | 
247  | 658k  |             m_area->offset += ptr - start;  | 
248  | 658k  |             chunk[chunk_len] = '\0';  | 
249  | 658k  |         }  | 
250  | 954k  |         if (ptr && ptr < end && !isspace(*ptr)) { | 
251  |  |             /* we hit max_len, rewind to a space */  | 
252  | 318k  |             count = rewind_tospace(chunk, chunk_len);  | 
253  | 318k  |             if (count < chunk_len) { | 
254  | 250k  |                 chunk[count] = '\0';  | 
255  | 250k  |                 m_area->offset -= chunk_len - count;  | 
256  | 250k  |             }  | 
257  | 318k  |         }  | 
258  | 954k  |     } else { | 
259  | 0  |         if (!stream) { | 
260  | 0  |             cli_dbgmsg("No HTML stream\n"); | 
261  | 0  |             free(chunk);  | 
262  | 0  |             return NULL;  | 
263  | 0  |         }  | 
264  | 0  |         chunk_len = fread(chunk, 1, max_len - 1, stream);  | 
265  | 0  |         if (!chunk_len || chunk_len > max_len - 1) { | 
266  |  |             /* EOF, or prevent overflow */  | 
267  | 0  |             free(chunk);  | 
268  | 0  |             return NULL;  | 
269  | 0  |         }  | 
270  |  |  | 
271  |  |         /* Look for NULL chars */  | 
272  | 0  |         ptr = memchr(chunk, 0, chunk_len);  | 
273  | 0  |         if (ptr) { | 
274  |  |             /* NULL char found */  | 
275  |  |             /* save buffer limits */  | 
276  | 0  |             start = ptr;  | 
277  | 0  |             end   = chunk + chunk_len;  | 
278  |  |  | 
279  |  |             /* start of NULL chars, we will copy non-NULL characters  | 
280  |  |              * to this position */  | 
281  | 0  |             chunk_len = ptr - chunk;  | 
282  |  |  | 
283  |  |             /* find first non-NULL char */  | 
284  | 0  |             while ((ptr < end) && !(*ptr)) { | 
285  | 0  |                 ptr++;  | 
286  | 0  |             }  | 
287  |  |             /* skip over NULL chars, and move back the rest */  | 
288  | 0  |             while ((ptr < end) && (chunk_len < max_len - 1)) { | 
289  | 0  |                 const unsigned char c = *ptr++;  | 
290  | 0  |                 if (c) { | 
291  | 0  |                     chunk[chunk_len++] = c;  | 
292  | 0  |                 }  | 
293  | 0  |             }  | 
294  | 0  |         }  | 
295  | 0  |         chunk[chunk_len] = '\0';  | 
296  | 0  |         if (chunk_len == max_len - 1) { | 
297  |  |             /* rewind to a space (which includes newline) */  | 
298  | 0  |             count = rewind_tospace(chunk, chunk_len);  | 
299  | 0  |             if (count < chunk_len) { | 
300  | 0  |                 chunk[count] = '\0';  | 
301  |  |                 /* seek-back to space */  | 
302  | 0  |                 fseek(stream, -(long)(chunk_len - count), SEEK_CUR);  | 
303  | 0  |             }  | 
304  | 0  |         }  | 
305  | 0  |     }  | 
306  |  |  | 
307  | 954k  |     return chunk;  | 
308  | 1.53M  | }  | 
309  |  |  | 
310  |  | static void html_output_flush(file_buff_t *fbuff)  | 
311  | 930k  | { | 
312  | 930k  |     if (fbuff && (fbuff->length > 0)) { | 
313  | 930k  |         cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);  | 
314  | 930k  |         fbuff->length = 0;  | 
315  | 930k  |     }  | 
316  | 930k  | }  | 
317  |  |  | 
318  |  | static inline void html_output_c(file_buff_t *fbuff1, unsigned char c)  | 
319  | 3.30G  | { | 
320  | 3.30G  |     if (fbuff1) { | 
321  | 3.30G  |         if (fbuff1->length == HTML_FILE_BUFF_LEN) { | 
322  | 301k  |             html_output_flush(fbuff1);  | 
323  | 301k  |         }  | 
324  | 3.30G  |         fbuff1->buffer[fbuff1->length++] = c;  | 
325  | 3.30G  |     }  | 
326  | 3.30G  | }  | 
327  |  |  | 
328  |  | static void html_output_str(file_buff_t *fbuff, const unsigned char *str, size_t len)  | 
329  | 1.24M  | { | 
330  | 1.24M  |     if (fbuff) { | 
331  | 1.24M  |         if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) { | 
332  | 2.49k  |             html_output_flush(fbuff);  | 
333  | 2.49k  |         }  | 
334  | 1.24M  |         if (len >= HTML_FILE_BUFF_LEN) { | 
335  | 0  |             html_output_flush(fbuff);  | 
336  | 0  |             cli_writen(fbuff->fd, str, len);  | 
337  | 1.24M  |         } else { | 
338  | 1.24M  |             memcpy(fbuff->buffer + fbuff->length, str, len);  | 
339  | 1.24M  |             fbuff->length += len;  | 
340  | 1.24M  |         }  | 
341  | 1.24M  |     }  | 
342  | 1.24M  | }  | 
343  |  |  | 
344  |  | static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)  | 
345  | 1.87M  | { | 
346  | 1.87M  |     int i;  | 
347  |  |  | 
348  | 3.80M  |     for (i = 0; i < tags->count; i++) { | 
349  | 3.01M  |         if (strcmp((const char *)tags->tag[i], tag) == 0) { | 
350  | 1.08M  |             return (char *)tags->value[i];  | 
351  | 1.08M  |         }  | 
352  | 3.01M  |     }  | 
353  | 793k  |     return NULL;  | 
354  | 1.87M  | }  | 
355  |  |  | 
356  |  | static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)  | 
357  | 14.7k  | { | 
358  | 14.7k  |     int i;  | 
359  |  |  | 
360  | 104k  |     for (i = 0; i < tags->count; i++) { | 
361  | 104k  |         if (strcmp((const char *)tags->tag[i], tag) == 0) { | 
362  | 14.7k  |             free(tags->value[i]);  | 
363  | 14.7k  |             tags->value[i] = (unsigned char *)cli_safer_strdup(value);  | 
364  | 14.7k  |             return;  | 
365  | 14.7k  |         }  | 
366  | 104k  |     }  | 
367  | 0  |     return;  | 
368  | 14.7k  | }  | 
369  |  | void html_tag_arg_add(tag_arguments_t *tags,  | 
370  |  |                       const char *tag, char *value)  | 
371  | 7.83M  | { | 
372  | 7.83M  |     int len, i;  | 
373  | 7.83M  |     tags->count++;  | 
374  | 7.83M  |     tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,  | 
375  | 7.83M  |                                                           tags->count * sizeof(char *));  | 
376  | 7.83M  |     if (!tags->tag) { | 
377  | 0  |         goto done;  | 
378  | 0  |     }  | 
379  | 7.83M  |     tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,  | 
380  | 7.83M  |                                                             tags->count * sizeof(char *));  | 
381  | 7.83M  |     if (!tags->value) { | 
382  | 0  |         goto done;  | 
383  | 0  |     }  | 
384  | 7.83M  |     if (tags->scanContents) { | 
385  | 0  |         tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,  | 
386  | 0  |                                                                    tags->count * sizeof(*tags->contents));  | 
387  | 0  |         if (!tags->contents) { | 
388  | 0  |             goto done;  | 
389  | 0  |         }  | 
390  | 0  |         tags->contents[tags->count - 1] = NULL;  | 
391  | 0  |     }  | 
392  | 7.83M  |     tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag);  | 
393  | 7.83M  |     if (value) { | 
394  | 2.39M  |         if (*value == '"') { | 
395  | 672k  |             tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1);  | 
396  | 672k  |             len                          = strlen((const char *)value + 1);  | 
397  | 672k  |             if (len > 0) { | 
398  | 668k  |                 tags->value[tags->count - 1][len - 1] = '\0';  | 
399  | 668k  |             }  | 
400  | 1.72M  |         } else { | 
401  | 1.72M  |             tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value);  | 
402  | 1.72M  |         }  | 
403  | 5.44M  |     } else { | 
404  | 5.44M  |         tags->value[tags->count - 1] = NULL;  | 
405  | 5.44M  |     }  | 
406  | 7.83M  |     return;  | 
407  |  |  | 
408  | 0  | done:  | 
409  |  |     /* Bad error - can't do 100% recovery */  | 
410  | 0  |     tags->count--;  | 
411  | 0  |     for (i = 0; i < tags->count; i++) { | 
412  | 0  |         if (tags->tag) { | 
413  | 0  |             free(tags->tag[i]);  | 
414  | 0  |         }  | 
415  | 0  |         if (tags->value) { | 
416  | 0  |             free(tags->value[i]);  | 
417  | 0  |         }  | 
418  | 0  |         if (tags->contents) { | 
419  | 0  |             if (tags->contents[i])  | 
420  | 0  |                 free(tags->contents[i]);  | 
421  | 0  |         }  | 
422  | 0  |     }  | 
423  | 0  |     if (tags->tag) { | 
424  | 0  |         free(tags->tag);  | 
425  | 0  |     }  | 
426  | 0  |     if (tags->value) { | 
427  | 0  |         free(tags->value);  | 
428  | 0  |     }  | 
429  | 0  |     if (tags->contents)  | 
430  | 0  |         free(tags->contents);  | 
431  | 0  |     tags->contents = NULL;  | 
432  | 0  |     tags->tag = tags->value = NULL;  | 
433  | 0  |     tags->count             = 0;  | 
434  | 0  |     return;  | 
435  | 7.83M  | }  | 
436  |  |  | 
437  |  | static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)  | 
438  | 14.7k  | { | 
439  | 14.7k  |     int i, j, len;  | 
440  |  |  | 
441  | 14.7k  |     html_output_c(fbuff, '<');  | 
442  | 14.7k  |     html_output_str(fbuff, (const unsigned char *)tag, strlen(tag));  | 
443  | 124k  |     for (i = 0; i < tags->count; i++) { | 
444  | 110k  |         html_output_c(fbuff, ' ');  | 
445  | 110k  |         html_output_str(fbuff, tags->tag[i], strlen((const char *)tags->tag[i]));  | 
446  | 110k  |         if (tags->value[i]) { | 
447  | 41.8k  |             html_output_str(fbuff, (const unsigned char *)"=\"", 2);  | 
448  | 41.8k  |             len = strlen((const char *)tags->value[i]);  | 
449  | 773k  |             for (j = 0; j < len; j++) { | 
450  | 731k  |                 html_output_c(fbuff, tolower(tags->value[i][j]));  | 
451  | 731k  |             }  | 
452  | 41.8k  |             html_output_c(fbuff, '"');  | 
453  | 41.8k  |         }  | 
454  | 110k  |     }  | 
455  | 14.7k  |     html_output_c(fbuff, '>');  | 
456  | 14.7k  | }  | 
457  |  |  | 
458  |  | void html_tag_arg_free(tag_arguments_t *tags)  | 
459  | 9.86M  | { | 
460  | 9.86M  |     int i;  | 
461  |  |  | 
462  | 17.7M  |     for (i = 0; i < tags->count; i++) { | 
463  | 7.83M  |         free(tags->tag[i]);  | 
464  | 7.83M  |         if (tags->value[i]) { | 
465  | 2.39M  |             free(tags->value[i]);  | 
466  | 2.39M  |         }  | 
467  | 7.83M  |         if (tags->contents)  | 
468  | 0  |             if (tags->contents[i])  | 
469  | 0  |                 free(tags->contents[i]);  | 
470  | 7.83M  |     }  | 
471  | 9.86M  |     if (tags->tag) { | 
472  | 2.52M  |         free(tags->tag);  | 
473  | 2.52M  |     }  | 
474  | 9.86M  |     if (tags->value) { | 
475  | 2.52M  |         free(tags->value);  | 
476  | 2.52M  |     }  | 
477  | 9.86M  |     if (tags->contents)  | 
478  | 0  |         free(tags->contents);  | 
479  | 9.86M  |     tags->contents = NULL;  | 
480  | 9.86M  |     tags->tag = tags->value = NULL;  | 
481  | 9.86M  |     tags->count             = 0;  | 
482  | 9.86M  | }  | 
483  |  |  | 
484  |  | /**  | 
485  |  |  * the displayed text for an <a href> tag  | 
486  |  |  */  | 
487  |  | static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char *begin, const unsigned char *end)  | 
488  | 0  | { | 
489  | 0  |     size_t i;  | 
490  | 0  |     uint32_t mbchar = 0;  | 
491  | 0  |     if (!begin || !end)  | 
492  | 0  |         return;  | 
493  | 0  |     for (i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) { | 
494  | 0  |         uint8_t c = *begin++;  | 
495  | 0  |         if (mbchar && (c < 0x80 || mbchar >= 0x10000)) { | 
496  | 0  |             if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||  | 
497  | 0  |                 (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F))) { | 
498  | 0  |                 cont->contents[i++] = '.';  | 
499  | 0  |                 if (mbchar == 0xA1) { | 
500  | 0  |                     --i;  | 
501  | 0  |                     mbchar = 0;  | 
502  | 0  |                     continue;  | 
503  | 0  |                 }  | 
504  | 0  |             } else { | 
505  | 0  |                 uint8_t c0 = mbchar >> 16;  | 
506  | 0  |                 uint8_t c1 = (mbchar >> 8) & 0xff;  | 
507  | 0  |                 uint8_t c2 = (mbchar & 0xff);  | 
508  | 0  |                 if (c0 && i + 1 < MAX_TAG_CONTENTS_LENGTH)  | 
509  | 0  |                     cont->contents[i++] = c0;  | 
510  | 0  |                 if ((c0 || c1) && i + 1 < MAX_TAG_CONTENTS_LENGTH)  | 
511  | 0  |                     cont->contents[i++] = c1;  | 
512  | 0  |                 if (i + 1 < MAX_TAG_CONTENTS_LENGTH)  | 
513  | 0  |                     cont->contents[i++] = c2;  | 
514  | 0  |             }  | 
515  | 0  |             mbchar = 0;  | 
516  | 0  |         }  | 
517  | 0  |         if (c >= 0x80) { | 
518  | 0  |             mbchar = (mbchar << 8) | c;  | 
519  | 0  |             --i;  | 
520  | 0  |         } else  | 
521  | 0  |             cont->contents[i] = c;  | 
522  | 0  |     }  | 
523  | 0  |     cont->pos = i;  | 
524  | 0  | }  | 
525  |  |  | 
526  |  | static inline void html_tag_contents_done(tag_arguments_t *tags, int idx, struct tag_contents *cont)  | 
527  | 0  | { | 
528  | 0  |     unsigned char *p;  | 
529  | 0  |     cont->contents[cont->pos++] = '\0';  | 
530  | 0  |     p                           = cli_max_malloc(cont->pos);  | 
531  | 0  |     if (!p) { | 
532  | 0  |         cli_errmsg("html_tag_contents_done: Unable to allocate memory for p\n"); | 
533  | 0  |         return;  | 
534  | 0  |     }  | 
535  | 0  |     memcpy(p, cont->contents, cont->pos);  | 
536  | 0  |     tags->contents[idx - 1] = p;  | 
537  | 0  |     cont->pos               = 0;  | 
538  | 0  | }  | 
539  |  |  | 
/* Decoder state carried by screnc_decode() from one call to the next. */
struct screnc_state {
    uint32_t length;   /* encoded bytes still to be decoded; counted down by screnc_decode() */
    uint32_t sum;      /* running sum of the decoded values, compared with the trailing checksum */
    uint8_t table_pos; /* index into table_order[]; advanced modulo 64 per decoded byte */
};
545  |  |  | 
546  |  | /* inplace decoding, so that we can normalize it later */  | 
547  |  | static void screnc_decode(unsigned char *ptr, struct screnc_state *s)  | 
548  | 688k  | { | 
549  | 688k  |     uint8_t value;  | 
550  | 688k  |     unsigned char *dst = ptr;  | 
551  |  |  | 
552  | 688k  |     if (!ptr || !s)  | 
553  | 0  |         return;  | 
554  | 1.11G  |     while (s->length > 0 && *ptr) { | 
555  | 1.11G  |         if ((*ptr == '\n') || (*ptr == '\r')) { | 
556  | 10.2M  |             ptr++;  | 
557  | 10.2M  |             continue;  | 
558  | 10.2M  |         }  | 
559  | 1.10G  |         if (*ptr < 0x80) { | 
560  | 736M  |             value = decrypt_tables[table_order[s->table_pos]][*ptr];  | 
561  | 736M  |             if (value == 0xFF) { /* special character */ | 
562  | 6.71M  |                 ptr++;  | 
563  | 6.71M  |                 s->length--;  | 
564  | 6.71M  |                 switch (*ptr) { | 
565  | 8.63k  |                     case '\0':  | 
566  |  |                         /* Fixup for end of line */  | 
567  | 8.63k  |                         ptr--;  | 
568  | 8.63k  |                         break;  | 
569  | 61.0k  |                     case 0x21:  | 
570  | 61.0k  |                         value = 0x3c;  | 
571  | 61.0k  |                         break;  | 
572  | 165k  |                     case 0x23:  | 
573  | 165k  |                         value = 0x0d;  | 
574  | 165k  |                         break;  | 
575  | 61.0k  |                     case 0x24:  | 
576  | 61.0k  |                         value = 0x40;  | 
577  | 61.0k  |                         break;  | 
578  | 45.4k  |                     case 0x26:  | 
579  | 45.4k  |                         value = 0x0a;  | 
580  | 45.4k  |                         break;  | 
581  | 40.1k  |                     case 0x2a:  | 
582  | 40.1k  |                         value = 0x3e;  | 
583  | 40.1k  |                         break;  | 
584  | 6.71M  |                 }  | 
585  | 6.71M  |             }  | 
586  | 736M  |             s->sum += value;  | 
587  | 736M  |             *dst++       = value;  | 
588  | 736M  |             s->table_pos = (s->table_pos + 1) % 64;  | 
589  | 736M  |         } else { | 
590  | 365M  |             *dst++ = *ptr++;  | 
591  | 365M  |             *dst++ = *ptr;  | 
592  | 365M  |             if (!*ptr) { | 
593  | 81.3k  |                 dst--;  | 
594  | 81.3k  |                 break;  | 
595  | 81.3k  |             }  | 
596  | 365M  |         }  | 
597  | 1.10G  |         ptr++;  | 
598  | 1.10G  |         s->length--;  | 
599  | 1.10G  |     }  | 
600  | 688k  |     if (!s->length) { | 
601  | 238k  |         size_t remaining;  | 
602  | 238k  |         if (strlen((const char *)ptr) >= 12) { | 
603  | 232k  |             uint64_t expected;  | 
604  | 232k  |             expected = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;  | 
605  | 232k  |             expected += base64_chars[ptr[1]] >> 4;  | 
606  | 232k  |             expected += (base64_chars[ptr[1]] & 0x0f) << 12;  | 
607  | 232k  |             expected += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;  | 
608  | 232k  |             expected += (base64_chars[ptr[2]] & 0x03) << 22;  | 
609  | 232k  |             expected += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;  | 
610  | 232k  |             expected += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;  | 
611  | 232k  |             expected += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;  | 
612  | 232k  |             ptr += 8;  | 
613  | 232k  |             if (s->sum != expected) { | 
614  | 226k  |                 cli_dbgmsg("screnc_decode: checksum mismatch: %u != %" PRIu64 "\n", s->sum, expected); | 
615  | 226k  |             } else { | 
616  | 5.73k  |                 if (strncmp((const char *)ptr, "^#~@", 4) != 0) { | 
617  | 4.59k  |                     cli_dbgmsg("screnc_decode: terminator not found\n"); | 
618  | 4.59k  |                 } else { | 
619  | 1.14k  |                     cli_dbgmsg("screnc_decode: OK\n"); | 
620  | 1.14k  |                 }  | 
621  | 5.73k  |             }  | 
622  | 232k  |             ptr += 4;  | 
623  | 232k  |         }  | 
624  |  |         /* copy remaining */  | 
625  | 238k  |         remaining = strlen((const char *)ptr) + 1;  | 
626  | 238k  |         memmove(dst, ptr, remaining);  | 
627  | 450k  |     } else { | 
628  | 450k  |         *dst = '\0';  | 
629  | 450k  |     }  | 
630  | 688k  | }  | 
631  |  |  | 
632  |  | static void js_process(struct parser_state *js_state, const unsigned char *js_begin, const unsigned char *js_end,  | 
633  |  |                        const unsigned char *line, const unsigned char *ptr, tag_type in_tag, const char *dirname)  | 
634  | 640k  | { | 
635  | 640k  |     if (!js_begin)  | 
636  | 275k  |         js_begin = line;  | 
637  | 640k  |     if (!js_end)  | 
638  | 455k  |         js_end = ptr;  | 
639  | 640k  |     if (js_end > js_begin &&  | 
640  | 640k  |         CLI_ISCONTAINED(line, 8192, js_begin, 1) &&  | 
641  | 640k  |         CLI_ISCONTAINED(line, 8192, js_end, 1)) { | 
642  | 639k  |         cli_js_process_buffer(js_state, (const char *)js_begin, js_end - js_begin);  | 
643  | 639k  |     }  | 
644  | 640k  |     if (in_tag == TAG_DONT_EXTRACT) { | 
645  |  |         /*  we found a /script, normalize script now */  | 
646  | 184k  |         cli_js_parse_done(js_state);  | 
647  | 184k  |         cli_js_output(js_state, dirname);  | 
648  | 184k  |         cli_js_destroy(js_state);  | 
649  | 184k  |     }  | 
650  | 640k  | }  | 
651  |  |  | 
652  |  | static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)  | 
653  | 259k  | { | 
654  | 259k  |     int fd_tmp, tag_length = 0, tag_arg_length = 0;  | 
655  | 259k  |     bool binary, retval = false, escape = false, hex = false;  | 
656  | 259k  |     int64_t value = 0, tag_val_length = 0;  | 
657  | 259k  |     bool look_for_screnc = false, in_screnc = false, text_space_written = false;  | 
658  | 259k  |     tag_type in_tag  = TAG_DONT_EXTRACT;  | 
659  | 259k  |     FILE *stream_in  = NULL;  | 
660  | 259k  |     html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;  | 
661  | 259k  |     char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];  | 
662  | 259k  |     char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;  | 
663  | 259k  |     unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;  | 
664  | 259k  |     tag_arguments_t tag_args;  | 
665  | 259k  |     quoted_state quoted  = NOT_QUOTED;  | 
666  | 259k  |     unsigned long length = 0;  | 
667  | 259k  |     struct screnc_state screnc_state;  | 
668  | 259k  |     file_buff_t *file_buff_o2, *file_buff_text;  | 
669  | 259k  |     file_buff_t *file_tmp_o1           = NULL;  | 
670  | 259k  |     int in_ahref                       = 0;    /* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/  | 
671  | 259k  |     unsigned char *href_contents_begin = NULL; /*beginning of the next portion of <a> contents*/  | 
672  | 259k  |     unsigned char *ptrend              = NULL; /*end of <a> contents*/  | 
673  | 259k  |     unsigned char *in_form_action      = NULL; /* the action URL of the current <form> tag, if any*/  | 
674  |  |  | 
675  | 259k  |     struct entity_conv conv;  | 
676  | 259k  |     unsigned char entity_val[HTML_STR_LENGTH + 1];  | 
677  | 259k  |     size_t entity_val_length = 0;  | 
678  | 259k  |     const int dconf_entconv  = dconf ? dconf->phishing & PHISHING_CONF_ENTCONV : 1;  | 
679  | 259k  |     const int dconf_js       = dirname && (dconf ? dconf->doc & DOC_CONF_JSNORM : 1); /* TODO */  | 
680  |  |     /* dconf for phishing engine sets scanContents, so no need for a flag here */  | 
681  | 259k  |     struct parser_state *js_state = NULL;  | 
682  | 259k  |     const unsigned char *js_begin = NULL, *js_end = NULL;  | 
683  | 259k  |     uint8_t *style_buff              = NULL;  | 
684  | 259k  |     size_t style_buff_size           = 0;  | 
685  | 259k  |     const unsigned char *style_begin = NULL, *style_end = NULL;  | 
686  | 259k  |     struct tag_contents contents;  | 
687  | 259k  |     uint32_t mbchar  = 0;  | 
688  | 259k  |     uint32_t mbchar2 = 0;  | 
689  |  |  | 
690  |  |     /*  | 
691  |  |      * Initialize stack buffers.  | 
692  |  |      */  | 
693  | 259k  |     memset(filename, 0, sizeof(filename));  | 
694  | 259k  |     memset(tag, 0, sizeof(tag));  | 
695  | 259k  |     memset(tag_arg, 0, sizeof(tag_arg));  | 
696  | 259k  |     memset(tag_val, 0, sizeof(tag_val));  | 
697  | 259k  |     memset(entity_val, 0, sizeof(entity_val));  | 
698  |  |  | 
699  | 259k  |     tag_args.scanContents = 0; /* do we need to store the contents of <a></a>?*/  | 
700  | 259k  |     contents.pos          = 0;  | 
701  | 259k  |     if (!m_area) { | 
702  | 0  |         if (fd < 0) { | 
703  | 0  |             cli_dbgmsg("Invalid HTML fd\n"); | 
704  | 0  |             return false;  | 
705  | 0  |         }  | 
706  | 0  |         lseek(fd, 0, SEEK_SET);  | 
707  | 0  |         fd_tmp = dup(fd);  | 
708  | 0  |         if (fd_tmp < 0) { | 
709  | 0  |             return false;  | 
710  | 0  |         }  | 
711  | 0  |         stream_in = fdopen(fd_tmp, "r");  | 
712  | 0  |         if (!stream_in) { | 
713  | 0  |             close(fd_tmp);  | 
714  | 0  |             return false;  | 
715  | 0  |         }  | 
716  | 0  |     }  | 
717  |  |  | 
718  | 259k  |     tag_args.count    = 0;  | 
719  | 259k  |     tag_args.tag      = NULL;  | 
720  | 259k  |     tag_args.value    = NULL;  | 
721  | 259k  |     tag_args.contents = NULL;  | 
722  | 259k  |     if (dirname) { | 
723  | 259k  |         file_buff_o2 = (file_buff_t *)malloc(sizeof(file_buff_t));  | 
724  | 259k  |         if (!file_buff_o2) { | 
725  | 0  |             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_o2\n"); | 
726  | 0  |             file_buff_o2 = file_buff_text = NULL;  | 
727  | 0  |             goto done;  | 
728  | 0  |         }  | 
729  |  |  | 
730  |  |         /* this will still contains scripts that are inside comments */  | 
731  | 259k  |         snprintf(filename, 1024, "%s" PATHSEP "nocomment.html", dirname);  | 
732  | 259k  |         file_buff_o2->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);  | 
733  | 259k  |         if (file_buff_o2->fd == -1) { | 
734  | 0  |             cli_dbgmsg("open failed: %s\n", filename); | 
735  | 0  |             free(file_buff_o2);  | 
736  | 0  |             file_buff_o2 = file_buff_text = NULL;  | 
737  | 0  |             goto done;  | 
738  | 0  |         }  | 
739  |  |  | 
740  | 259k  |         file_buff_text = (file_buff_t *)malloc(sizeof(file_buff_t));  | 
741  | 259k  |         if (!file_buff_text) { | 
742  | 0  |             close(file_buff_o2->fd);  | 
743  | 0  |             free(file_buff_o2);  | 
744  | 0  |             file_buff_o2 = file_buff_text = NULL;  | 
745  | 0  |             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_text\n"); | 
746  | 0  |             goto done;  | 
747  | 0  |         }  | 
748  |  |  | 
749  | 259k  |         snprintf(filename, 1024, "%s" PATHSEP "notags.html", dirname);  | 
750  | 259k  |         file_buff_text->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);  | 
751  | 259k  |         if (file_buff_text->fd == -1) { | 
752  | 0  |             cli_dbgmsg("open failed: %s\n", filename); | 
753  | 0  |             close(file_buff_o2->fd);  | 
754  | 0  |             free(file_buff_o2);  | 
755  | 0  |             free(file_buff_text);  | 
756  | 0  |             file_buff_o2 = file_buff_text = NULL;  | 
757  | 0  |             goto done;  | 
758  | 0  |         }  | 
759  | 259k  |         file_buff_o2->length   = 0;  | 
760  | 259k  |         file_buff_text->length = 0;  | 
761  | 259k  |     } else { | 
762  | 0  |         file_buff_o2   = NULL;  | 
763  | 0  |         file_buff_text = NULL;  | 
764  | 0  |     }  | 
765  |  |  | 
766  | 259k  |     binary = false;  | 
767  |  |  | 
768  | 259k  |     ptr = line = cli_readchunk(stream_in, m_area, 8192);  | 
769  |  |  | 
770  | 959k  |     while (line) { | 
771  | 700k  |         if (href_contents_begin)  | 
772  | 0  |             href_contents_begin = ptr; /*start of a new line, last line already appended to contents see below*/  | 
773  | 1.03M  |         while (*ptr && isspace(*ptr)) { | 
774  | 337k  |             ptr++;  | 
775  | 337k  |         }  | 
776  | 3.20G  |         while (*ptr) { | 
777  | 3.20G  |             if (!binary && *ptr == '\n') { | 
778  |  |                 /* Convert it to a space and re-process */  | 
779  | 14.7M  |                 *ptr = ' ';  | 
780  | 14.7M  |                 continue;  | 
781  | 14.7M  |             }  | 
782  | 3.19G  |             if (!binary && *ptr == '\r') { | 
783  | 5.27M  |                 ptr++;  | 
784  | 5.27M  |                 continue;  | 
785  | 5.27M  |             }  | 
786  | 3.18G  |             switch (state) { | 
787  | 0  |                 case HTML_SPECIAL_CHAR:  | 
788  | 0  |                     cli_dbgmsg("Impossible, special_char can't occur here\n"); | 
789  | 0  |                     break;  | 
790  | 0  |                 case HTML_BAD_STATE:  | 
791  |  |                     /* An engine error has occurred */  | 
792  | 0  |                     cli_dbgmsg("HTML Engine Error\n"); | 
793  | 0  |                     goto done;  | 
794  | 81.7M  |                 case HTML_SKIP_WS:  | 
795  | 81.7M  |                     if (isspace(*ptr)) { | 
796  | 15.0M  |                         ptr++;  | 
797  | 66.6M  |                     } else { | 
798  | 66.6M  |                         state      = next_state;  | 
799  | 66.6M  |                         next_state = HTML_BAD_STATE;  | 
800  | 66.6M  |                     }  | 
801  | 81.7M  |                     break;  | 
802  | 156M  |                 case HTML_TRIM_WS:  | 
803  | 156M  |                     if (isspace(*ptr)) { | 
804  | 93.7M  |                         ptr++;  | 
805  | 93.7M  |                     } else { | 
806  | 62.6M  |                         if (in_tag == TAG_DONT_EXTRACT) { | 
807  | 9.52M  |                             html_output_c(file_buff_o2, ' ');  | 
808  | 9.52M  |                         }  | 
809  | 62.6M  |                         state      = next_state;  | 
810  | 62.6M  |                         next_state = HTML_BAD_STATE;  | 
811  | 62.6M  |                     }  | 
812  | 156M  |                     break;  | 
813  | 420M  |                 case HTML_8BIT:  | 
814  | 420M  |                     if (*ptr < 0x80 || mbchar >= 0x10000) { | 
815  | 190M  |                         if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||  | 
816  | 190M  |                             (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) { | 
817  |  |                             /* bb #4097 */  | 
818  | 481k  |                             html_output_c(file_buff_o2, '.');  | 
819  | 481k  |                             html_output_c(file_buff_text, '.');  | 
820  | 481k  |                             if (mbchar == 0xA1) { | 
821  | 330k  |                                 ptr++;  | 
822  | 330k  |                                 mbchar = 0;  | 
823  | 330k  |                                 continue;  | 
824  | 330k  |                             }  | 
825  | 189M  |                         } else { | 
826  | 189M  |                             uint8_t c0 = mbchar >> 16;  | 
827  | 189M  |                             uint8_t c1 = (mbchar >> 8) & 0xff;  | 
828  | 189M  |                             uint8_t c2 = (mbchar & 0xff);  | 
829  | 189M  |                             if (c0) { | 
830  | 101M  |                                 html_output_c(file_buff_o2, c0);  | 
831  | 101M  |                                 html_output_c(file_buff_text, c0);  | 
832  | 101M  |                             }  | 
833  | 189M  |                             if (c0 || c1) { | 
834  | 127M  |                                 html_output_c(file_buff_o2, c1);  | 
835  | 127M  |                                 html_output_c(file_buff_text, c1);  | 
836  | 127M  |                             }  | 
837  | 189M  |                             html_output_c(file_buff_o2, c2);  | 
838  | 189M  |                             html_output_c(file_buff_text, c1);  | 
839  | 189M  |                         }  | 
840  | 190M  |                         mbchar     = 0;  | 
841  | 190M  |                         state      = next_state;  | 
842  | 190M  |                         next_state = HTML_NORM;  | 
843  | 230M  |                     } else { | 
844  | 230M  |                         mbchar = (mbchar << 8) | *ptr;  | 
845  | 230M  |                         ptr++;  | 
846  | 230M  |                     }  | 
847  | 420M  |                     break;  | 
848  | 1.99G  |                 case HTML_NORM:  | 
849  | 1.99G  |                     if (*ptr == '<') { | 
850  | 37.0M  |                         ptrend = ptr; /* for use by scanContents */  | 
851  | 37.0M  |                         html_output_c(file_buff_o2, '<');  | 
852  | 37.0M  |                         if (in_tag == TAG_DONT_EXTRACT && !text_space_written) { | 
853  | 3.61M  |                             html_output_c(file_buff_text, ' ');  | 
854  | 3.61M  |                             text_space_written = true;  | 
855  | 3.61M  |                         }  | 
856  | 37.0M  |                         if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin) { | 
857  |  |                             /*append this text portion to the contents of <a>*/  | 
858  | 0  |                             html_tag_contents_append(&contents, href_contents_begin, ptr);  | 
859  | 0  |                             href_contents_begin = NULL; /*We just encountered another tag inside <a>, so skip it*/  | 
860  | 0  |                         }  | 
861  | 37.0M  |                         ptr++;  | 
862  | 37.0M  |                         state      = HTML_SKIP_WS;  | 
863  | 37.0M  |                         tag_length = 0;  | 
864  | 37.0M  |                         next_state = HTML_TAG;  | 
865  | 1.96G  |                     } else if (isspace(*ptr)) { | 
866  | 62.7M  |                         if (!text_space_written && in_tag == TAG_DONT_EXTRACT) { | 
867  | 8.42M  |                             html_output_c(file_buff_text, ' ');  | 
868  | 8.42M  |                             text_space_written = true;  | 
869  | 8.42M  |                         }  | 
870  | 62.7M  |                         state      = HTML_TRIM_WS;  | 
871  | 62.7M  |                         next_state = HTML_NORM;  | 
872  | 1.89G  |                     } else if (*ptr == '&') { | 
873  | 9.78M  |                         if (!text_space_written && in_tag == TAG_DONT_EXTRACT) { | 
874  | 773k  |                             html_output_c(file_buff_text, ' ');  | 
875  | 773k  |                             text_space_written = true;  | 
876  | 773k  |                         }  | 
877  | 9.78M  |                         state      = HTML_CHAR_REF;  | 
878  | 9.78M  |                         next_state = HTML_NORM;  | 
879  | 9.78M  |                         ptr++;  | 
880  | 1.89G  |                     } else if (*ptr >= 0x80) { | 
881  | 190M  |                         state      = HTML_8BIT;  | 
882  | 190M  |                         next_state = HTML_NORM;  | 
883  | 190M  |                         mbchar     = *ptr;  | 
884  | 190M  |                         ptr++;  | 
885  | 1.69G  |                     } else { | 
886  | 1.69G  |                         unsigned char c = tolower(*ptr);  | 
887  |  |                         /* normalize ' to " for scripts */  | 
888  | 1.69G  |                         if (in_tag != TAG_DONT_EXTRACT && c == '\'') c = '"';  | 
889  | 1.69G  |                         html_output_c(file_buff_o2, c);  | 
890  | 1.69G  |                         if (in_tag == TAG_DONT_EXTRACT) { | 
891  | 238M  |                             if (*ptr < 0x20) { | 
892  | 23.2M  |                                 if (!text_space_written) { | 
893  | 11.0M  |                                     html_output_c(file_buff_text, ' ');  | 
894  | 11.0M  |                                     text_space_written = true;  | 
895  | 11.0M  |                                 }  | 
896  | 215M  |                             } else { | 
897  | 215M  |                                 html_output_c(file_buff_text, c);  | 
898  | 215M  |                                 text_space_written = false;  | 
899  | 215M  |                             }  | 
900  | 238M  |                         }  | 
901  | 1.69G  |                         ptr++;  | 
902  | 1.69G  |                     }  | 
903  | 1.99G  |                     break;  | 
904  | 152M  |                 case HTML_TAG:  | 
905  | 152M  |                     if ((tag_length == 0) && (*ptr == '!')) { | 
906  |  |                         /* Comment */  | 
907  | 526k  |                         if (in_tag != TAG_DONT_EXTRACT) { | 
908  |  |                             /* we still write scripts to nocomment.html */  | 
909  | 474k  |                             html_output_c(file_buff_o2, '!');  | 
910  | 474k  |                         } else { | 
911  |  |                             /* Need to rewind in the no-comment output stream */  | 
912  | 52.3k  |                             if (file_buff_o2 && (file_buff_o2->length > 0)) { | 
913  | 52.3k  |                                 file_buff_o2->length--;  | 
914  | 52.3k  |                             }  | 
915  | 52.3k  |                         }  | 
916  | 526k  |                         state      = HTML_COMMENT;  | 
917  | 526k  |                         next_state = HTML_BAD_STATE;  | 
918  | 526k  |                         ptr++;  | 
919  | 152M  |                     } else if (*ptr == '>') { | 
920  | 7.06M  |                         html_output_c(file_buff_o2, '>');  | 
921  | 7.06M  |                         ptr++;  | 
922  | 7.06M  |                         tag[tag_length] = '\0';  | 
923  | 7.06M  |                         state           = HTML_SKIP_WS;  | 
924  | 7.06M  |                         next_state      = HTML_PROCESS_TAG;  | 
925  | 144M  |                     } else if (!isspace(*ptr)) { | 
926  | 142M  |                         html_output_c(file_buff_o2, tolower(*ptr));  | 
927  |  |                         /* if we're inside a script we only care for </script>.*/  | 
928  | 142M  |                         if (in_tag != TAG_DONT_EXTRACT && tag_length == 0 && *ptr != '/') { | 
929  | 26.7M  |                             state = HTML_NORM;  | 
930  | 26.7M  |                         }  | 
931  | 142M  |                         if (tag_length < HTML_STR_LENGTH) { | 
932  | 139M  |                             tag[tag_length++] = tolower(*ptr);  | 
933  | 139M  |                         }  | 
934  | 142M  |                         ptr++;  | 
935  | 142M  |                     } else { | 
936  | 2.69M  |                         tag[tag_length] = '\0';  | 
937  | 2.69M  |                         state           = HTML_SKIP_WS;  | 
938  | 2.69M  |                         tag_arg_length  = 0;  | 
939  |  |                         /* if we'd go to HTML_TAG_ARG whitespace would be inconsistently normalized for in_tag*/  | 
940  | 2.69M  |                         next_state = in_tag == TAG_DONT_EXTRACT ? HTML_TAG_ARG : HTML_PROCESS_TAG;  | 
941  | 2.69M  |                     }  | 
942  | 152M  |                     break;  | 
943  | 119M  |                 case HTML_TAG_ARG:  | 
944  | 119M  |                     if (*ptr == '=') { | 
945  | 2.39M  |                         html_output_c(file_buff_o2, '=');  | 
946  | 2.39M  |                         tag_arg[tag_arg_length] = '\0';  | 
947  | 2.39M  |                         ptr++;  | 
948  | 2.39M  |                         state          = HTML_SKIP_WS;  | 
949  | 2.39M  |                         escape         = false;  | 
950  | 2.39M  |                         quoted         = NOT_QUOTED;  | 
951  | 2.39M  |                         tag_val_length = 0;  | 
952  | 2.39M  |                         next_state     = HTML_TAG_ARG_VAL;  | 
953  | 117M  |                     } else if (isspace(*ptr)) { | 
954  | 4.35M  |                         ptr++;  | 
955  | 4.35M  |                         tag_arg[tag_arg_length] = '\0';  | 
956  | 4.35M  |                         state                   = HTML_SKIP_WS;  | 
957  | 4.35M  |                         next_state              = HTML_TAG_ARG_EQUAL;  | 
958  | 113M  |                     } else if (*ptr == '>') { | 
959  | 2.50M  |                         html_output_c(file_buff_o2, '>');  | 
960  | 2.50M  |                         if (tag_arg_length > 0) { | 
961  | 1.24M  |                             tag_arg[tag_arg_length] = '\0';  | 
962  | 1.24M  |                             html_tag_arg_add(&tag_args, tag_arg, NULL);  | 
963  | 1.24M  |                         }  | 
964  | 2.50M  |                         ptr++;  | 
965  | 2.50M  |                         state      = HTML_PROCESS_TAG;  | 
966  | 2.50M  |                         next_state = HTML_BAD_STATE;  | 
967  | 110M  |                     } else { | 
968  | 110M  |                         if (tag_arg_length == 0) { | 
969  |  |                             /* Start of new tag - add space */  | 
970  | 7.82M  |                             html_output_c(file_buff_o2, ' ');  | 
971  | 7.82M  |                         }  | 
972  | 110M  |                         html_output_c(file_buff_o2, tolower(*ptr));  | 
973  | 110M  |                         if (tag_arg_length < HTML_STR_LENGTH) { | 
974  | 107M  |                             tag_arg[tag_arg_length++] = tolower(*ptr);  | 
975  | 107M  |                         }  | 
976  | 110M  |                         ptr++;  | 
977  | 110M  |                     }  | 
978  | 119M  |                     break;  | 
979  | 4.34M  |                 case HTML_TAG_ARG_EQUAL:  | 
980  | 4.34M  |                     if (*ptr == '=') { | 
981  | 151k  |                         html_output_c(file_buff_o2, '=');  | 
982  | 151k  |                         ptr++;  | 
983  | 151k  |                         state          = HTML_SKIP_WS;  | 
984  | 151k  |                         escape         = false;  | 
985  | 151k  |                         quoted         = NOT_QUOTED;  | 
986  | 151k  |                         tag_val_length = 0;  | 
987  | 151k  |                         next_state     = HTML_TAG_ARG_VAL;  | 
988  | 4.19M  |                     } else { | 
989  | 4.19M  |                         if (tag_arg_length > 0) { | 
990  | 4.19M  |                             tag_arg[tag_arg_length] = '\0';  | 
991  | 4.19M  |                             html_tag_arg_add(&tag_args, tag_arg, NULL);  | 
992  | 4.19M  |                         }  | 
993  | 4.19M  |                         tag_arg_length = 0;  | 
994  | 4.19M  |                         state          = HTML_TAG_ARG;  | 
995  | 4.19M  |                         next_state     = HTML_BAD_STATE;  | 
996  | 4.19M  |                     }  | 
997  | 4.34M  |                     break;  | 
998  | 59.0M  |                 case HTML_TAG_ARG_VAL:  | 
999  | 59.0M  |                     if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) { | 
1000  |  |                         /* RFC2397 inline data */  | 
1001  |  |  | 
1002  |  |                         /* Rewind one byte so we don't recursive */  | 
1003  | 21.6k  |                         if (file_buff_o2 && (file_buff_o2->length > 0)) { | 
1004  | 21.6k  |                             file_buff_o2->length--;  | 
1005  | 21.6k  |                         }  | 
1006  |  |  | 
1007  | 21.6k  |                         if (quoted != NOT_QUOTED) { | 
1008  | 0  |                             html_output_c(file_buff_o2, '"');  | 
1009  | 0  |                         }  | 
1010  | 21.6k  |                         tag_val_length = 0;  | 
1011  | 21.6k  |                         state          = HTML_RFC2397_TYPE;  | 
1012  | 21.6k  |                         next_state     = HTML_TAG_ARG;  | 
1013  | 59.0M  |                     } else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) { | 
1014  |  |                         /* RFC2397 inline data */  | 
1015  |  |  | 
1016  |  |                         /* Rewind one byte so we don't recursive */  | 
1017  | 117k  |                         if (file_buff_o2 && (file_buff_o2->length > 0)) { | 
1018  | 117k  |                             file_buff_o2->length--;  | 
1019  | 117k  |                         }  | 
1020  |  |  | 
1021  | 117k  |                         if (quoted != NOT_QUOTED) { | 
1022  | 117k  |                             html_output_c(file_buff_o2, '"');  | 
1023  | 117k  |                         }  | 
1024  |  |  | 
1025  | 117k  |                         tag_val_length = 0;  | 
1026  | 117k  |                         state          = HTML_RFC2397_TYPE;  | 
1027  | 117k  |                         next_state     = HTML_TAG_ARG;  | 
1028  | 58.9M  |                     } else if (*ptr == '&') { | 
1029  | 1.00M  |                         state      = HTML_CHAR_REF;  | 
1030  | 1.00M  |                         next_state = HTML_TAG_ARG_VAL;  | 
1031  | 1.00M  |                         ptr++;  | 
1032  | 57.9M  |                     } else if (*ptr == '\'') { | 
1033  | 766k  |                         if (tag_val_length == 0) { | 
1034  | 287k  |                             quoted = SINGLE_QUOTED;  | 
1035  | 287k  |                             html_output_c(file_buff_o2, '"');  | 
1036  | 287k  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1037  | 287k  |                                 tag_val[tag_val_length++] = '"';  | 
1038  | 287k  |                             }  | 
1039  | 287k  |                             ptr++;  | 
1040  | 479k  |                         } else { | 
1041  | 479k  |                             if (!escape && (quoted == SINGLE_QUOTED)) { | 
1042  | 121k  |                                 html_output_c(file_buff_o2, '"');  | 
1043  | 121k  |                                 if (tag_val_length < HTML_STR_LENGTH) { | 
1044  | 120k  |                                     tag_val[tag_val_length++] = '"';  | 
1045  | 120k  |                                 }  | 
1046  | 121k  |                                 tag_val[tag_val_length] = '\0';  | 
1047  | 121k  |                                 html_tag_arg_add(&tag_args, tag_arg, tag_val);  | 
1048  | 121k  |                                 ptr++;  | 
1049  | 121k  |                                 state          = HTML_SKIP_WS;  | 
1050  | 121k  |                                 tag_arg_length = 0;  | 
1051  | 121k  |                                 next_state     = HTML_TAG_ARG;  | 
1052  | 357k  |                             } else { | 
1053  | 357k  |                                 html_output_c(file_buff_o2, '"');  | 
1054  | 357k  |                                 if (tag_val_length < HTML_STR_LENGTH) { | 
1055  | 320k  |                                     tag_val[tag_val_length++] = '"';  | 
1056  | 320k  |                                 }  | 
1057  | 357k  |                                 ptr++;  | 
1058  | 357k  |                             }  | 
1059  | 479k  |                         }  | 
1060  | 57.1M  |                     } else if (*ptr == '"') { | 
1061  | 1.03M  |                         if (tag_val_length == 0) { | 
1062  | 508k  |                             quoted = DOUBLE_QUOTED;  | 
1063  | 508k  |                             html_output_c(file_buff_o2, '"');  | 
1064  | 508k  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1065  | 508k  |                                 tag_val[tag_val_length++] = '"';  | 
1066  | 508k  |                             }  | 
1067  | 508k  |                             ptr++;  | 
1068  | 527k  |                         } else { | 
1069  | 527k  |                             if (!escape && (quoted == DOUBLE_QUOTED)) { | 
1070  | 160k  |                                 html_output_c(file_buff_o2, '"');  | 
1071  | 160k  |                                 if (tag_val_length < HTML_STR_LENGTH) { | 
1072  | 159k  |                                     tag_val[tag_val_length++] = '"';  | 
1073  | 159k  |                                 }  | 
1074  | 160k  |                                 tag_val[tag_val_length] = '\0';  | 
1075  | 160k  |                                 html_tag_arg_add(&tag_args, tag_arg, tag_val);  | 
1076  | 160k  |                                 ptr++;  | 
1077  | 160k  |                                 state          = HTML_SKIP_WS;  | 
1078  | 160k  |                                 tag_arg_length = 0;  | 
1079  | 160k  |                                 next_state     = HTML_TAG_ARG;  | 
1080  | 366k  |                             } else { | 
1081  | 366k  |                                 html_output_c(file_buff_o2, '"');  | 
1082  | 366k  |                                 if (tag_val_length < HTML_STR_LENGTH) { | 
1083  | 363k  |                                     tag_val[tag_val_length++] = '"';  | 
1084  | 363k  |                                 }  | 
1085  | 366k  |                                 ptr++;  | 
1086  | 366k  |                             }  | 
1087  | 527k  |                         }  | 
1088  | 56.1M  |                     } else if (isspace(*ptr) || (*ptr == '>')) { | 
1089  | 2.50M  |                         if (quoted == NOT_QUOTED) { | 
1090  | 2.11M  |                             tag_val[tag_val_length] = '\0';  | 
1091  | 2.11M  |                             html_tag_arg_add(&tag_args, tag_arg, tag_val);  | 
1092  | 2.11M  |                             state          = HTML_SKIP_WS;  | 
1093  | 2.11M  |                             tag_arg_length = 0;  | 
1094  | 2.11M  |                             next_state     = HTML_TAG_ARG;  | 
1095  | 2.11M  |                         } else { | 
1096  | 395k  |                             html_output_c(file_buff_o2, *ptr);  | 
1097  | 395k  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1098  | 393k  |                                 if (isspace(*ptr)) { | 
1099  | 107k  |                                     tag_val[tag_val_length++] = ' ';  | 
1100  | 285k  |                                 } else { | 
1101  | 285k  |                                     tag_val[tag_val_length++] = '>';  | 
1102  | 285k  |                                 }  | 
1103  | 393k  |                             }  | 
1104  | 395k  |                             state      = HTML_SKIP_WS;  | 
1105  | 395k  |                             escape     = false;  | 
1106  | 395k  |                             quoted     = NOT_QUOTED;  | 
1107  | 395k  |                             next_state = HTML_TAG_ARG_VAL;  | 
1108  | 395k  |                             ptr++;  | 
1109  | 395k  |                         }  | 
1110  | 53.6M  |                     } else { | 
1111  | 53.6M  |                         if (mbchar2 && (*ptr < 0x80 || mbchar2 >= 0x10000)) { | 
1112  | 5.67M  |                             if (mbchar2 == 0xE38082 || mbchar2 == 0xEFBC8E || mbchar2 == 0xEFB992 ||  | 
1113  | 5.67M  |                                 (mbchar2 == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) { | 
1114  | 174k  |                                 html_output_c(file_buff_o2, '.');  | 
1115  | 174k  |                                 if (tag_val_length < HTML_STR_LENGTH)  | 
1116  | 172k  |                                     tag_val[tag_val_length++] = '.';  | 
1117  | 174k  |                                 if (mbchar2 == 0xA1) { | 
1118  | 149k  |                                     ptr++;  | 
1119  | 149k  |                                     mbchar2 = 0;  | 
1120  | 149k  |                                     continue;  | 
1121  | 149k  |                                 }  | 
1122  | 5.50M  |                             } else { | 
1123  | 5.50M  |                                 uint8_t c0 = mbchar2 >> 16;  | 
1124  | 5.50M  |                                 uint8_t c1 = (mbchar2 >> 8) & 0xff;  | 
1125  | 5.50M  |                                 uint8_t c2 = (mbchar2 & 0xff);  | 
1126  | 5.50M  |                                 if (c0)  | 
1127  | 3.12M  |                                     html_output_c(file_buff_o2, c0);  | 
1128  | 5.50M  |                                 if (c0 || c1)  | 
1129  | 3.69M  |                                     html_output_c(file_buff_o2, c1);  | 
1130  | 5.50M  |                                 html_output_c(file_buff_o2, c2);  | 
1131  | 5.50M  |                                 if (c0 && tag_val_length < HTML_STR_LENGTH)  | 
1132  | 2.72M  |                                     tag_val[tag_val_length++] = c0;  | 
1133  | 5.50M  |                                 if ((c0 || c1) && tag_val_length < HTML_STR_LENGTH)  | 
1134  | 3.28M  |                                     tag_val[tag_val_length++] = c1;  | 
1135  | 5.50M  |                                 if (tag_val_length < HTML_STR_LENGTH)  | 
1136  | 5.06M  |                                     tag_val[tag_val_length++] = c2;  | 
1137  | 5.50M  |                             }  | 
1138  | 5.52M  |                             mbchar2 = 0;  | 
1139  | 5.52M  |                         }  | 
1140  | 53.4M  |                         if (*ptr >= 0x80)  | 
1141  | 12.5M  |                             mbchar2 = (mbchar2 << 8) | *ptr;  | 
1142  | 40.8M  |                         else { | 
1143  | 40.8M  |                             html_output_c(file_buff_o2, tolower(*ptr));  | 
1144  | 40.8M  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1145  | 37.9M  |                                 tag_val[tag_val_length++] = *ptr;  | 
1146  | 37.9M  |                             }  | 
1147  | 40.8M  |                         }  | 
1148  | 53.4M  |                         ptr++;  | 
1149  | 53.4M  |                     }  | 
1150  |  |  | 
1151  | 58.9M  |                     if (*ptr == '\\') { | 
1152  | 68.1k  |                         escape = true;  | 
1153  | 58.8M  |                     } else { | 
1154  | 58.8M  |                         escape = false;  | 
1155  | 58.8M  |                     }  | 
1156  | 58.9M  |                     break;  | 
1157  | 70.5M  |                 case HTML_COMMENT:  | 
1158  | 70.5M  |                     if (in_tag != TAG_DONT_EXTRACT && !isspace(*ptr)) { | 
1159  | 60.3M  |                         unsigned char c = tolower(*ptr);  | 
1160  |  |                         /* dump script to nocomment.html, since we no longer have  | 
1161  |  |                          * comment.html/script.html */  | 
1162  | 60.3M  |                         if (c == '\'') c = '"';  | 
1163  | 60.3M  |                         html_output_c(file_buff_o2, c);  | 
1164  | 60.3M  |                     }  | 
1165  | 70.5M  |                     if (*ptr == '>') { | 
1166  | 520k  |                         state      = HTML_SKIP_WS;  | 
1167  | 520k  |                         next_state = HTML_NORM;  | 
1168  | 520k  |                     }  | 
1169  | 70.5M  |                     ptr++;  | 
1170  | 70.5M  |                     break;  | 
1171  | 9.60M  |                 case HTML_PROCESS_TAG:  | 
1172  |  |  | 
1173  |  |                     /* Default to no action for this tag */  | 
1174  | 9.60M  |                     state      = HTML_SKIP_WS;  | 
1175  | 9.60M  |                     next_state = HTML_NORM;  | 
1176  | 9.60M  |                     if (tag[0] == '/') { | 
1177  |  |                         /* End tag */  | 
1178  | 2.22M  |                         state      = HTML_SKIP_WS;  | 
1179  | 2.22M  |                         next_state = HTML_NORM;  | 
1180  |  |  | 
1181  | 2.22M  |                         if (strcmp(tag, "/script") == 0) { | 
1182  | 667k  |                             in_tag = TAG_DONT_EXTRACT;  | 
1183  | 667k  |                             if (js_state) { | 
1184  | 184k  |                                 js_end = ptr;  | 
1185  | 184k  |                                 js_process(js_state, js_begin, js_end, line, ptr, in_tag, dirname);  | 
1186  | 184k  |                                 js_state = NULL;  | 
1187  | 184k  |                                 js_begin = js_end = NULL;  | 
1188  | 184k  |                             }  | 
1189  |  |                             /*don't output newlines in nocomment.html  | 
1190  |  |                              * html_output_c(file_buff_o2, '\n');*/  | 
1191  | 1.55M  |                         } else if ((strcmp(tag, "/style") == 0) && (in_tag == TAG_STYLE)) { | 
1192  | 19.8k  |                             size_t chunk_size;  | 
1193  |  |  | 
1194  | 19.8k  |                             style_end = ptr - strlen("</style>"); | 
1195  |  |  | 
1196  | 19.8k  |                             if (style_end < style_begin) { | 
1197  | 416  |                                 cli_dbgmsg("cli_html_normalise: style chunk size underflow\n"); | 
1198  | 416  |                                 goto done;  | 
1199  | 416  |                             }  | 
1200  |  |  | 
1201  | 19.4k  |                             chunk_size = style_end - style_begin;  | 
1202  |  |  | 
1203  | 19.4k  |                             if (style_buff == NULL) { | 
1204  | 14.4k  |                                 CLI_MAX_MALLOC_OR_GOTO_DONE(style_buff, chunk_size + 1);  | 
1205  | 14.4k  |                             } else { | 
1206  | 5.03k  |                                 CLI_MAX_REALLOC_OR_GOTO_DONE(style_buff, style_buff_size + chunk_size + 1);  | 
1207  | 5.03k  |                             }  | 
1208  |  |  | 
1209  | 19.4k  |                             memcpy(style_buff + style_buff_size, style_begin, chunk_size);  | 
1210  |  |  | 
1211  | 19.4k  |                             style_buff_size += chunk_size;  | 
1212  | 19.4k  |                             style_buff[style_buff_size] = '\0';  | 
1213  |  |  | 
1214  | 19.4k  |                             in_tag      = TAG_DONT_EXTRACT;  | 
1215  | 19.4k  |                             style_begin = style_end = NULL;  | 
1216  | 19.4k  |                         }  | 
1217  |  |  | 
1218  | 2.22M  |                         if (hrefs && hrefs->scanContents && in_ahref) { | 
1219  | 0  |                             if (strcmp(tag, "/a") == 0) { | 
1220  | 0  |                                 html_tag_contents_done(hrefs, in_ahref, &contents);  | 
1221  | 0  |                                 in_ahref = 0; /* we are no longer inside an <a href>  | 
1222  |  |                                                         nesting <a> tags not supported, and shouldn't be supported*/  | 
1223  | 0  |                             }  | 
1224  | 0  |                             href_contents_begin = ptr;  | 
1225  | 0  |                         }  | 
1226  | 2.22M  |                         if (strcmp(tag, "/form") == 0) { | 
1227  | 790  |                             if (in_form_action)  | 
1228  | 0  |                                 free(in_form_action);  | 
1229  | 790  |                             in_form_action = NULL;  | 
1230  | 790  |                         }  | 
1231  | 7.38M  |                     } else if (strcmp(tag, "script") == 0) { | 
1232  | 364k  |                         arg_value = html_tag_arg_value(&tag_args, "language");  | 
1233  |  |                         /* TODO: maybe we can output all tags only via html_output_tag */  | 
1234  | 364k  |                         if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0)) { | 
1235  | 7.10k  |                             html_tag_arg_set(&tag_args, "language", "javascript");  | 
1236  | 7.10k  |                             state      = HTML_SKIP_WS;  | 
1237  | 7.10k  |                             next_state = HTML_JSDECODE;  | 
1238  |  |                             /* we already output the old tag, output the new tag now */  | 
1239  | 7.10k  |                             html_output_tag(file_buff_o2, tag, &tag_args);  | 
1240  | 357k  |                         } else if (arg_value && (strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) { | 
1241  | 7.65k  |                             html_tag_arg_set(&tag_args, "language", "vbscript");  | 
1242  | 7.65k  |                             state      = HTML_SKIP_WS;  | 
1243  | 7.65k  |                             next_state = HTML_JSDECODE;  | 
1244  |  |                             /* we already output the old tag, output the new tag now */  | 
1245  | 7.65k  |                             html_output_tag(file_buff_o2, tag, &tag_args);  | 
1246  | 7.65k  |                         }  | 
1247  | 364k  |                         in_tag = TAG_SCRIPT;  | 
1248  | 364k  |                         if (dconf_js && !js_state) { | 
1249  | 364k  |                             js_state = cli_js_init();  | 
1250  | 364k  |                             if (!js_state) { | 
1251  | 0  |                                 cli_dbgmsg("htmlnorm: Failed to initialize js parser\n"); | 
1252  | 0  |                             }  | 
1253  | 364k  |                             js_begin = ptr;  | 
1254  | 364k  |                             js_end   = NULL;  | 
1255  | 364k  |                         }  | 
1256  | 7.01M  |                     } else if (strcmp(tag, "style") == 0) { | 
1257  | 35.7k  |                         in_tag      = TAG_STYLE;  | 
1258  | 35.7k  |                         style_begin = ptr;  | 
1259  | 35.7k  |                         style_end   = NULL;  | 
1260  | 6.98M  |                     } else if (strcmp(tag, "%@") == 0) { | 
1261  | 675k  |                         arg_value = html_tag_arg_value(&tag_args, "language");  | 
1262  | 675k  |                         if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0 ||  | 
1263  | 486k  |                                           strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) { | 
1264  |  |  | 
1265  | 202k  |                             saved_next_state = next_state;  | 
1266  | 202k  |                             next_state       = state;  | 
1267  | 202k  |                             look_for_screnc  = false;  | 
1268  | 202k  |                             state            = HTML_LOOKFOR_SCRENC;  | 
1269  | 202k  |                         }  | 
1270  | 6.30M  |                     } else if (hrefs) { | 
1271  | 0  |                         if (in_ahref && !href_contents_begin)  | 
1272  | 0  |                             href_contents_begin = ptr;  | 
1273  | 0  |                         if (strcmp(tag, "a") == 0) { | 
1274  | 0  |                             arg_value = html_tag_arg_value(&tag_args, "href");  | 
1275  | 0  |                             if (arg_value && strlen((const char *)arg_value) > 0) { | 
1276  | 0  |                                 if (hrefs->scanContents) { | 
1277  | 0  |                                     char *arg_value_title = html_tag_arg_value(&tag_args, "title");  | 
1278  |  |                                     /*beginning of an <a> tag*/  | 
1279  | 0  |                                     if (in_ahref)  | 
1280  |  |                                         /*we encountered nested <a> tags, pretend previous closed*/  | 
1281  | 0  |                                         if (href_contents_begin) { | 
1282  | 0  |                                             html_tag_contents_append(&contents, href_contents_begin, ptrend);  | 
1283  |  |                                             /*add pending contents between tags*/  | 
1284  | 0  |                                             html_tag_contents_done(hrefs, in_ahref, &contents);  | 
1285  | 0  |                                             in_ahref = 0;  | 
1286  | 0  |                                         }  | 
1287  | 0  |                                     if (arg_value_title) { | 
1288  |  |                                         /* title is a 'displayed link'*/  | 
1289  | 0  |                                         html_tag_arg_add(hrefs, "href_title", arg_value_title);  | 
1290  | 0  |                                         html_tag_contents_append(&contents, (const unsigned char *)arg_value,  | 
1291  | 0  |                                                                  (const unsigned char *)arg_value + strlen(arg_value));  | 
1292  | 0  |                                         html_tag_contents_done(hrefs, hrefs->count, &contents);  | 
1293  | 0  |                                     }  | 
1294  | 0  |                                     if (in_form_action) { | 
1295  |  |                                         /* form action is the real URL, and href is the 'displayed' */  | 
1296  | 0  |                                         html_tag_arg_add(hrefs, "form", arg_value);  | 
1297  | 0  |                                         contents.pos = 0;  | 
1298  | 0  |                                         html_tag_contents_append(&contents, in_form_action,  | 
1299  | 0  |                                                                  in_form_action + strlen((const char *)in_form_action));  | 
1300  | 0  |                                         html_tag_contents_done(hrefs, hrefs->count, &contents);  | 
1301  | 0  |                                     }  | 
1302  | 0  |                                 }  | 
1303  | 0  |                                 html_tag_arg_add(hrefs, "href", arg_value);  | 
1304  | 0  |                                 if (hrefs->scanContents) { | 
1305  | 0  |                                     in_ahref            = hrefs->count; /* index of this tag (counted from 1) */  | 
1306  | 0  |                                     href_contents_begin = ptr;          /* contents begin after <a ..> ends */  | 
1307  | 0  |                                     contents.pos        = 0;  | 
1308  | 0  |                                 }  | 
1309  | 0  |                             }  | 
1310  | 0  |                         } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) { | 
1311  | 0  |                             const char *arg_action_value = html_tag_arg_value(&tag_args, "action");  | 
1312  | 0  |                             if (arg_action_value) { | 
1313  | 0  |                                 if (in_form_action)  | 
1314  | 0  |                                     free(in_form_action);  | 
1315  | 0  |                                 in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);  | 
1316  | 0  |                             }  | 
1317  | 0  |                         } else if (strcmp(tag, "img") == 0) { | 
1318  | 0  |                             arg_value = html_tag_arg_value(&tag_args, "src");  | 
1319  | 0  |                             if (arg_value && strlen(arg_value) > 0) { | 
1320  | 0  |                                 html_tag_arg_add(hrefs, "src", arg_value);  | 
1321  | 0  |                                 if (hrefs->scanContents && in_ahref)  | 
1322  |  |                                     /* "contents" of an img tag, is the URL of its parent <a> tag */  | 
1323  | 0  |                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);  | 
1324  | 0  |                                 if (in_form_action) { | 
1325  |  |                                     /* form action is the real URL, and href is the 'displayed' */  | 
1326  | 0  |                                     html_tag_arg_add(hrefs, "form", arg_value);  | 
1327  | 0  |                                     contents.pos = 0;  | 
1328  | 0  |                                     html_tag_contents_append(&contents, in_form_action,  | 
1329  | 0  |                                                              in_form_action + strlen((const char *)in_form_action));  | 
1330  | 0  |                                     html_tag_contents_done(hrefs, hrefs->count, &contents);  | 
1331  | 0  |                                 }  | 
1332  | 0  |                             }  | 
1333  | 0  |                             arg_value = html_tag_arg_value(&tag_args, "dynsrc");  | 
1334  | 0  |                             if (arg_value && strlen(arg_value) > 0) { | 
1335  | 0  |                                 html_tag_arg_add(hrefs, "dynsrc", arg_value);  | 
1336  | 0  |                                 if (hrefs->scanContents && in_ahref)  | 
1337  |  |                                     /* see above */  | 
1338  | 0  |                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);  | 
1339  | 0  |                                 if (in_form_action) { | 
1340  |  |                                     /* form action is the real URL, and href is the 'displayed' */  | 
1341  | 0  |                                     html_tag_arg_add(hrefs, "form", arg_value);  | 
1342  | 0  |                                     contents.pos = 0;  | 
1343  | 0  |                                     html_tag_contents_append(&contents, in_form_action,  | 
1344  | 0  |                                                              in_form_action + strlen((const char *)in_form_action));  | 
1345  | 0  |                                     html_tag_contents_done(hrefs, hrefs->count, &contents);  | 
1346  | 0  |                                 }  | 
1347  | 0  |                             }  | 
1348  | 0  |                         } else if (strcmp(tag, "iframe") == 0) { | 
1349  | 0  |                             arg_value = html_tag_arg_value(&tag_args, "src");  | 
1350  | 0  |                             if (arg_value && strlen(arg_value) > 0) { | 
1351  | 0  |                                 html_tag_arg_add(hrefs, "iframe", arg_value);  | 
1352  | 0  |                                 if (hrefs->scanContents && in_ahref)  | 
1353  |  |                                     /* see above */  | 
1354  | 0  |                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);  | 
1355  | 0  |                                 if (in_form_action) { | 
1356  |  |                                     /* form action is the real URL, and href is the 'displayed' */  | 
1357  | 0  |                                     html_tag_arg_add(hrefs, "form", arg_value);  | 
1358  | 0  |                                     contents.pos = 0;  | 
1359  | 0  |                                     html_tag_contents_append(&contents, in_form_action,  | 
1360  | 0  |                                                              in_form_action + strlen((const char *)in_form_action));  | 
1361  | 0  |                                     html_tag_contents_done(hrefs, hrefs->count, &contents);  | 
1362  | 0  |                                 }  | 
1363  | 0  |                             }  | 
1364  | 0  |                         } else if (strcmp(tag, "area") == 0) { | 
1365  | 0  |                             arg_value = html_tag_arg_value(&tag_args, "href");  | 
1366  | 0  |                             if (arg_value && strlen(arg_value) > 0) { | 
1367  | 0  |                                 html_tag_arg_add(hrefs, "area", arg_value);  | 
1368  | 0  |                                 if (hrefs->scanContents && in_ahref)  | 
1369  |  |                                     /* see above */  | 
1370  | 0  |                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);  | 
1371  | 0  |                                 if (in_form_action) { | 
1372  |  |                                     /* form action is the real URL, and href is the 'displayed' */  | 
1373  | 0  |                                     html_tag_arg_add(hrefs, "form", arg_value);  | 
1374  | 0  |                                     contents.pos = 0;  | 
1375  | 0  |                                     html_tag_contents_append(&contents, in_form_action,  | 
1376  | 0  |                                                              in_form_action + strlen((const char *)in_form_action));  | 
1377  | 0  |                                     html_tag_contents_done(hrefs, hrefs->count, &contents);  | 
1378  | 0  |                                 }  | 
1379  | 0  |                             }  | 
1380  | 0  |                         }  | 
1381  |  |                         /* TODO:imagemaps can have urls too */  | 
1382  | 6.30M  |                     } else if (strcmp(tag, "a") == 0) { | 
1383  |  |                         /* a/img tags for buff_text can be processed only if we're not processing hrefs */  | 
1384  | 744k  |                         arg_value = html_tag_arg_value(&tag_args, "href");  | 
1385  | 744k  |                         if (arg_value && arg_value[0]) { | 
1386  | 539k  |                             html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));  | 
1387  | 539k  |                             html_output_c(file_buff_text, ' ');  | 
1388  | 539k  |                             text_space_written = true;  | 
1389  | 539k  |                         }  | 
1390  | 5.56M  |                     } else if (strcmp(tag, "img") == 0) { | 
1391  | 94.2k  |                         arg_value = html_tag_arg_value(&tag_args, "src");  | 
1392  | 94.2k  |                         if (arg_value && arg_value[0]) { | 
1393  | 15.9k  |                             html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));  | 
1394  | 15.9k  |                             html_output_c(file_buff_text, ' ');  | 
1395  | 15.9k  |                             text_space_written = true;  | 
1396  | 15.9k  |                         }  | 
1397  | 94.2k  |                     }  | 
1398  | 9.60M  |                     html_tag_arg_free(&tag_args);  | 
1399  | 9.60M  |                     break;  | 
1400  | 10.8M  |                 case HTML_CHAR_REF:  | 
1401  | 10.8M  |                     if (*ptr == '#') { | 
1402  | 2.96M  |                         value = 0;  | 
1403  | 2.96M  |                         hex   = false;  | 
1404  | 2.96M  |                         state = HTML_CHAR_REF_DECODE;  | 
1405  | 2.96M  |                         ptr++;  | 
1406  | 7.89M  |                     } else { | 
1407  | 7.89M  |                         if (dconf_entconv)  | 
1408  | 7.89M  |                             state = HTML_ENTITY_REF_DECODE;  | 
1409  | 0  |                         else { | 
1410  | 0  |                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1411  | 0  |                                 tag_val[tag_val_length++] = '&';  | 
1412  | 0  |                             }  | 
1413  | 0  |                             html_output_c(file_buff_o2, '&');  | 
1414  |  | 
  | 
1415  | 0  |                             state      = next_state;  | 
1416  | 0  |                             next_state = HTML_BAD_STATE;  | 
1417  | 0  |                         }  | 
1418  | 7.89M  |                     }  | 
1419  | 10.8M  |                     break;  | 
1420  | 19.7M  |                 case HTML_ENTITY_REF_DECODE:  | 
1421  | 19.7M  |                     if (*ptr == ';') { | 
1422  | 1.14M  |                         size_t i;  | 
1423  | 1.14M  |                         const char *normalized;  | 
1424  | 1.14M  |                         entity_val[entity_val_length] = '\0';  | 
1425  | 1.14M  |                         normalized                    = entity_norm(&conv, entity_val);  | 
1426  | 1.14M  |                         if (normalized) { | 
1427  | 527k  |                             for (i = 0; i < strlen(normalized); i++) { | 
1428  | 425k  |                                 const unsigned char c = normalized[i] & 0xff;  | 
1429  | 425k  |                                 html_output_c(file_buff_o2, c);  | 
1430  | 425k  |                                 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1431  | 59.3k  |                                     tag_val[tag_val_length++] = c;  | 
1432  | 59.3k  |                                 }  | 
1433  | 425k  |                             }  | 
1434  | 1.04M  |                         } else { | 
1435  | 1.04M  |                             html_output_c(file_buff_o2, '&');  | 
1436  | 1.04M  |                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1437  | 270k  |                                 tag_val[tag_val_length++] = '&';  | 
1438  | 270k  |                             }  | 
1439  | 2.57M  |                             for (i = 0; i < entity_val_length; i++) { | 
1440  | 1.52M  |                                 const char c = tolower(entity_val[i]);  | 
1441  | 1.52M  |                                 html_output_c(file_buff_o2, c);  | 
1442  | 1.52M  |                                 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1443  | 290k  |                                     tag_val[tag_val_length++] = c;  | 
1444  | 290k  |                                 }  | 
1445  | 1.52M  |                             }  | 
1446  | 1.04M  |                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1447  | 270k  |                                 tag_val[tag_val_length++] = ';';  | 
1448  | 270k  |                             }  | 
1449  | 1.04M  |                             html_output_c(file_buff_o2, ';');  | 
1450  | 1.04M  |                         }  | 
1451  | 1.14M  |                         entity_val_length = 0;  | 
1452  | 1.14M  |                         state             = next_state;  | 
1453  | 1.14M  |                         next_state        = HTML_BAD_STATE;  | 
1454  | 1.14M  |                         ptr++;  | 
1455  | 18.6M  |                     } else if ((isalnum(*ptr) || *ptr == '_' || *ptr == ':' || (*ptr == '-')) && entity_val_length < HTML_STR_LENGTH) { | 
1456  | 11.8M  |                         entity_val[entity_val_length++] = *ptr++;  | 
1457  | 11.8M  |                     } else { | 
1458  |  |                         /* entity too long, or not valid, dump it */  | 
1459  | 6.74M  |                         size_t i;  | 
1460  | 6.74M  |                         if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1461  | 468k  |                             tag_val[tag_val_length++] = '&';  | 
1462  | 468k  |                         }  | 
1463  | 6.74M  |                         html_output_c(file_buff_o2, '&');  | 
1464  | 16.5M  |                         for (i = 0; i < entity_val_length; i++) { | 
1465  | 9.75M  |                             const char c = tolower(entity_val[i]);  | 
1466  | 9.75M  |                             html_output_c(file_buff_o2, c);  | 
1467  | 9.75M  |                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1468  | 836k  |                                 tag_val[tag_val_length++] = c;  | 
1469  | 836k  |                             }  | 
1470  | 9.75M  |                         }  | 
1471  |  |  | 
1472  | 6.74M  |                         state             = next_state;  | 
1473  | 6.74M  |                         next_state        = HTML_BAD_STATE;  | 
1474  | 6.74M  |                         entity_val_length = 0;  | 
1475  | 6.74M  |                     }  | 
1476  | 19.7M  |                     break;  | 
1477  | 9.67M  |                 case HTML_CHAR_REF_DECODE:  | 
1478  | 9.67M  |                     if ((value == 0) && ((*ptr == 'x') || (*ptr == 'X'))) { | 
1479  | 217k  |                         hex = true;  | 
1480  | 217k  |                         ptr++;  | 
1481  | 9.46M  |                     } else if (*ptr == ';') { | 
1482  | 2.02M  |                         if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) { | 
1483  | 164k  |                             tag_val[tag_val_length++] = value; /* store encoded values too */  | 
1484  | 164k  |                         }  | 
1485  | 2.02M  |                         if (dconf_entconv) { | 
1486  |  |  | 
1487  | 2.02M  |                             if (value < 0x80)  | 
1488  | 1.82M  |                                 html_output_c(file_buff_o2, tolower(value));  | 
1489  | 197k  |                             else { | 
1490  | 197k  |                                 unsigned char buff[10];  | 
1491  | 197k  |                                 unsigned char *out = u16_normalize_tobuffer(value, buff, 10);  | 
1492  | 197k  |                                 if (out && out > buff) { | 
1493  | 197k  |                                     html_output_str(file_buff_o2, buff, out - buff - 1);  | 
1494  | 197k  |                                 }  | 
1495  | 197k  |                             }  | 
1496  | 2.02M  |                         } else  | 
1497  | 0  |                             html_output_c(file_buff_o2, tolower(value & 0xff));  | 
1498  | 2.02M  |                         state      = next_state;  | 
1499  | 2.02M  |                         next_state = HTML_BAD_STATE;  | 
1500  | 2.02M  |                         ptr++;  | 
1501  | 7.44M  |                     } else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) { | 
1502  | 6.52M  |                         int64_t increment = 0;  | 
1503  |  |  | 
1504  | 6.52M  |                         if (hex && value < INT64_MAX / 16) { | 
1505  | 269k  |                             value *= 16;  | 
1506  | 6.25M  |                         } else if (value < INT64_MAX / 10) { | 
1507  | 6.23M  |                             value *= 10;  | 
1508  | 6.23M  |                         } else { | 
1509  | 27.6k  |                             html_output_c(file_buff_o2, value);  | 
1510  | 27.6k  |                             state      = next_state;  | 
1511  | 27.6k  |                             next_state = HTML_BAD_STATE;  | 
1512  | 27.6k  |                             ptr++;  | 
1513  | 27.6k  |                             break;  | 
1514  | 27.6k  |                         }  | 
1515  | 6.50M  |                         if (isdigit(*ptr)) { | 
1516  | 6.39M  |                             increment = *ptr - '0';  | 
1517  | 6.39M  |                         } else { | 
1518  | 104k  |                             increment = tolower(*ptr) - 'a' + 10;  | 
1519  | 104k  |                         }  | 
1520  | 6.50M  |                         if (value > INT64_MAX - increment) { | 
1521  |  |                             /* Addition would result in integer overflow. */  | 
1522  | 0  |                             html_output_c(file_buff_o2, value);  | 
1523  | 0  |                             state      = next_state;  | 
1524  | 0  |                             next_state = HTML_BAD_STATE;  | 
1525  | 0  |                             ptr++;  | 
1526  | 0  |                             break;  | 
1527  | 0  |                         }  | 
1528  | 6.50M  |                         value += increment;  | 
1529  | 6.50M  |                         ptr++;  | 
1530  | 6.50M  |                     } else { | 
1531  | 912k  |                         html_output_c(file_buff_o2, value);  | 
1532  | 912k  |                         state      = next_state;  | 
1533  | 912k  |                         next_state = HTML_BAD_STATE;  | 
1534  | 912k  |                     }  | 
1535  | 9.65M  |                     break;  | 
1536  | 9.65M  |                 case HTML_LOOKFOR_SCRENC:  | 
1537  | 278k  |                     look_for_screnc = true;  | 
1538  | 278k  |                     ptr_screnc      = (unsigned char *)strstr((char *)ptr, "#@~^");  | 
1539  | 278k  |                     if (ptr_screnc) { | 
1540  | 263k  |                         ptr_screnc[0] = '/';  | 
1541  | 263k  |                         ptr_screnc[1] = '/';  | 
1542  | 263k  |                         ptr_screnc += 4;  | 
1543  | 263k  |                     }  | 
1544  | 278k  |                     state      = next_state;  | 
1545  | 278k  |                     next_state = saved_next_state;  | 
1546  | 278k  |                     break;  | 
1547  | 7.07M  |                 case HTML_JSDECODE:  | 
1548  |  |                     /* Check for start marker */  | 
1549  | 7.07M  |                     if (strncmp((const char *)ptr, "#@~^", 4) == 0) { | 
1550  | 4.94k  |                         ptr[0] = '/';  | 
1551  | 4.94k  |                         ptr[1] = '/';  | 
1552  | 4.94k  |                         ptr += 4;  | 
1553  | 4.94k  |                         state      = HTML_JSDECODE_LENGTH;  | 
1554  | 4.94k  |                         next_state = HTML_BAD_STATE;  | 
1555  | 7.06M  |                     } else { | 
1556  | 7.06M  |                         html_output_c(file_buff_o2, tolower(*ptr));  | 
1557  | 7.06M  |                         ptr++;  | 
1558  | 7.06M  |                     }  | 
1559  | 7.07M  |                     break;  | 
1560  | 248k  |                 case HTML_JSDECODE_LENGTH:  | 
1561  | 248k  |                     if (strlen((const char *)ptr) < 8) { | 
1562  | 837  |                         state      = HTML_NORM;  | 
1563  | 837  |                         next_state = HTML_BAD_STATE;  | 
1564  | 837  |                         break;  | 
1565  | 837  |                     }  | 
1566  | 247k  |                     memset(&screnc_state, 0, sizeof(screnc_state));  | 
1567  | 247k  |                     screnc_state.length = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;  | 
1568  | 247k  |                     screnc_state.length += base64_chars[ptr[1]] >> 4;  | 
1569  | 247k  |                     screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12;  | 
1570  | 247k  |                     screnc_state.length += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;  | 
1571  | 247k  |                     screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22;  | 
1572  | 247k  |                     screnc_state.length += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;  | 
1573  | 247k  |                     screnc_state.length += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;  | 
1574  | 247k  |                     screnc_state.length += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;  | 
1575  | 247k  |                     state      = HTML_JSDECODE_DECRYPT;  | 
1576  | 247k  |                     in_screnc  = true;  | 
1577  | 247k  |                     next_state = HTML_BAD_STATE;  | 
1578  |  |                     /* for JS normalizer */  | 
1579  | 247k  |                     ptr[7] = '\n';  | 
1580  | 247k  |                     ptr += 8;  | 
1581  | 247k  |                     break;  | 
1582  | 268k  |                 case HTML_JSDECODE_DECRYPT:  | 
1583  | 268k  |                     screnc_decode(ptr, &screnc_state);  | 
1584  | 268k  |                     if (!screnc_state.length) { | 
1585  | 210k  |                         state      = HTML_NORM;  | 
1586  | 210k  |                         next_state = HTML_BAD_STATE;  | 
1587  | 210k  |                         in_screnc  = false;  | 
1588  | 210k  |                         break;  | 
1589  | 210k  |                     } else { | 
1590  | 57.9k  |                         state      = HTML_NORM;  | 
1591  | 57.9k  |                         next_state = HTML_BAD_STATE;  | 
1592  | 57.9k  |                     }  | 
1593  | 57.9k  |                     break;  | 
1594  | 5.79M  |                 case HTML_RFC2397_TYPE:  | 
1595  | 5.79M  |                     if (*ptr == '\'') { | 
1596  | 121k  |                         if (!escape && (quoted == SINGLE_QUOTED)) { | 
1597  |  |                             /* Early end of data detected. Error */  | 
1598  | 3.91k  |                             ptr++;  | 
1599  | 3.91k  |                             state          = HTML_SKIP_WS;  | 
1600  | 3.91k  |                             tag_arg_length = 0;  | 
1601  | 3.91k  |                             next_state     = HTML_TAG_ARG;  | 
1602  | 117k  |                         } else { | 
1603  | 117k  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1604  | 106k  |                                 tag_val[tag_val_length++] = '"';  | 
1605  | 106k  |                             }  | 
1606  | 117k  |                             ptr++;  | 
1607  | 117k  |                         }  | 
1608  | 5.67M  |                     } else if (*ptr == '"') { | 
1609  | 1.22M  |                         if (!escape && (quoted == DOUBLE_QUOTED)) { | 
1610  |  |                             /* Early end of data detected. Error */  | 
1611  | 4.84k  |                             ptr++;  | 
1612  | 4.84k  |                             state          = HTML_SKIP_WS;  | 
1613  | 4.84k  |                             tag_arg_length = 0;  | 
1614  | 4.84k  |                             next_state     = HTML_TAG_ARG;  | 
1615  | 1.22M  |                         } else { | 
1616  | 1.22M  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1617  | 447k  |                                 tag_val[tag_val_length++] = '"';  | 
1618  | 447k  |                             }  | 
1619  | 1.22M  |                             ptr++;  | 
1620  | 1.22M  |                         }  | 
1621  | 4.44M  |                     } else if (isspace(*ptr) || (*ptr == '>')) { | 
1622  | 45.4k  |                         if (quoted == NOT_QUOTED) { | 
1623  |  |                             /* Early end of data detected. Error */  | 
1624  | 22.0k  |                             state          = HTML_SKIP_WS;  | 
1625  | 22.0k  |                             tag_arg_length = 0;  | 
1626  | 22.0k  |                             next_state     = HTML_TAG_ARG;  | 
1627  | 23.3k  |                         } else { | 
1628  | 23.3k  |                             if (tag_val_length < HTML_STR_LENGTH) { | 
1629  | 22.4k  |                                 if (isspace(*ptr)) { | 
1630  | 5.96k  |                                     tag_val[tag_val_length++] = ' ';  | 
1631  | 16.5k  |                                 } else { | 
1632  | 16.5k  |                                     tag_val[tag_val_length++] = '>';  | 
1633  | 16.5k  |                                 }  | 
1634  | 22.4k  |                             }  | 
1635  | 23.3k  |                             state      = HTML_SKIP_WS;  | 
1636  | 23.3k  |                             escape     = false;  | 
1637  | 23.3k  |                             quoted     = NOT_QUOTED;  | 
1638  | 23.3k  |                             next_state = HTML_RFC2397_TYPE;  | 
1639  | 23.3k  |                             ptr++;  | 
1640  | 23.3k  |                         }  | 
1641  | 4.39M  |                     } else if (*ptr == ',') { | 
1642  |  |                         /* Beginning of data */  | 
1643  | 107k  |                         tag_val[tag_val_length] = '\0';  | 
1644  | 107k  |                         state                   = HTML_RFC2397_INIT;  | 
1645  | 107k  |                         escape                  = false;  | 
1646  | 107k  |                         next_state              = HTML_BAD_STATE;  | 
1647  | 107k  |                         ptr++;  | 
1648  |  |  | 
1649  | 4.28M  |                     } else { | 
1650  | 4.28M  |                         if (tag_val_length < HTML_STR_LENGTH) { | 
1651  | 3.59M  |                             tag_val[tag_val_length++] = tolower(*ptr);  | 
1652  | 3.59M  |                         }  | 
1653  | 4.28M  |                         ptr++;  | 
1654  | 4.28M  |                     }  | 
1655  | 5.79M  |                     if (*ptr == '\\') { | 
1656  | 11.1k  |                         escape = true;  | 
1657  | 5.78M  |                     } else { | 
1658  | 5.78M  |                         escape = false;  | 
1659  | 5.78M  |                     }  | 
1660  | 5.79M  |                     break;  | 
1661  | 107k  |                 case HTML_RFC2397_INIT:  | 
1662  | 107k  |                     if (dirname) { | 
1663  | 107k  |                         STATBUF statbuf;  | 
1664  |  |  | 
1665  | 107k  |                         if (NULL != file_tmp_o1) { | 
1666  | 5.33k  |                             if (file_tmp_o1->fd != -1) { | 
1667  | 5.33k  |                                 html_output_flush(file_tmp_o1);  | 
1668  | 5.33k  |                                 close(file_tmp_o1->fd);  | 
1669  | 5.33k  |                                 file_tmp_o1->fd = -1;  | 
1670  | 5.33k  |                             }  | 
1671  | 5.33k  |                             free(file_tmp_o1);  | 
1672  | 5.33k  |                         }  | 
1673  |  |  | 
1674  | 107k  |                         file_tmp_o1 = (file_buff_t *)malloc(sizeof(file_buff_t));  | 
1675  | 107k  |                         if (!file_tmp_o1) { | 
1676  | 0  |                             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_tmp_o1\n"); | 
1677  | 0  |                             goto done;  | 
1678  | 0  |                         }  | 
1679  | 107k  |                         file_tmp_o1->fd = -1;  | 
1680  |  |  | 
1681  |  |                         /* Create rfc2397 directory if it doesn't already exist */  | 
1682  | 107k  |                         snprintf(filename, 1024, "%s" PATHSEP "rfc2397", dirname);  | 
1683  | 107k  |                         if (LSTAT(filename, &statbuf) == -1) { | 
1684  | 17.0k  |                             if (mkdir(filename, 0700) && errno != EEXIST) { | 
1685  | 0  |                                 cli_errmsg("Failed to create directory: %s\n", dirname); | 
1686  | 0  |                                 goto done;  | 
1687  | 0  |                             }  | 
1688  | 17.0k  |                         }  | 
1689  |  |  | 
1690  | 107k  |                         tmp_file = cli_gentemp(filename);  | 
1691  | 107k  |                         if (!tmp_file) { | 
1692  | 0  |                             goto done;  | 
1693  | 0  |                         }  | 
1694  | 107k  |                         cli_dbgmsg("RFC2397 data file: %s\n", tmp_file); | 
1695  | 107k  |                         file_tmp_o1->fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);  | 
1696  | 107k  |                         free(tmp_file);  | 
1697  | 107k  |                         if (file_tmp_o1->fd < 0) { | 
1698  | 0  |                             cli_dbgmsg("open failed: %s\n", filename); | 
1699  | 0  |                             goto done;  | 
1700  | 0  |                         }  | 
1701  | 107k  |                         file_tmp_o1->length = 0;  | 
1702  |  |  | 
1703  | 107k  |                         html_output_str(file_tmp_o1, (const unsigned char *)"From html-normalise\n", 20);  | 
1704  | 107k  |                         html_output_str(file_tmp_o1, (const unsigned char *)"Content-type: ", 14);  | 
1705  | 107k  |                         if ((tag_val_length == 0) && (*tag_val == ';')) { | 
1706  | 0  |                             html_output_str(file_tmp_o1, (const unsigned char *)"text/plain\n", 11);  | 
1707  | 0  |                         }  | 
1708  | 107k  |                         html_output_str(file_tmp_o1, (const unsigned char *)tag_val, tag_val_length);  | 
1709  | 107k  |                         html_output_c(file_tmp_o1, '\n');  | 
1710  | 107k  |                         if (strstr(tag_val, ";base64") != NULL) { | 
1711  | 3.49k  |                             html_output_str(file_tmp_o1, (const unsigned char *)"Content-transfer-encoding: base64\n", 34);  | 
1712  | 3.49k  |                         }  | 
1713  | 107k  |                         html_output_c(file_tmp_o1, '\n');  | 
1714  | 107k  |                     } else { | 
1715  | 0  |                         file_tmp_o1 = NULL;  | 
1716  | 0  |                     }  | 
1717  | 107k  |                     state  = HTML_RFC2397_DATA;  | 
1718  | 107k  |                     binary = true;  | 
1719  | 107k  |                     break;  | 
1720  | 57.3M  |                 case HTML_RFC2397_DATA:  | 
1721  | 57.3M  |                     if (*ptr == '&') { | 
1722  | 71.8k  |                         state      = HTML_CHAR_REF;  | 
1723  | 71.8k  |                         next_state = HTML_RFC2397_DATA;  | 
1724  | 71.8k  |                         ptr++;  | 
1725  | 57.3M  |                     } else if (*ptr == '%') { | 
1726  | 414k  |                         length     = 0;  | 
1727  | 414k  |                         value      = 0;  | 
1728  | 414k  |                         state      = HTML_ESCAPE_CHAR;  | 
1729  | 414k  |                         next_state = HTML_RFC2397_ESC;  | 
1730  | 414k  |                         ptr++;  | 
1731  | 56.8M  |                     } else if (*ptr == '\'') { | 
1732  | 127k  |                         if (!escape && (quoted == SINGLE_QUOTED)) { | 
1733  | 68.6k  |                             state = HTML_RFC2397_FINISH;  | 
1734  | 68.6k  |                             ptr++;  | 
1735  | 68.6k  |                         } else { | 
1736  | 59.0k  |                             html_output_c(file_tmp_o1, *ptr);  | 
1737  | 59.0k  |                             ptr++;  | 
1738  | 59.0k  |                         }  | 
1739  | 56.7M  |                     } else if (*ptr == '\"') { | 
1740  | 7.17M  |                         if (!escape && (quoted == DOUBLE_QUOTED)) { | 
1741  | 7.75k  |                             state = HTML_RFC2397_FINISH;  | 
1742  | 7.75k  |                             ptr++;  | 
1743  | 7.16M  |                         } else { | 
1744  | 7.16M  |                             html_output_c(file_tmp_o1, *ptr);  | 
1745  | 7.16M  |                             ptr++;  | 
1746  | 7.16M  |                         }  | 
1747  | 49.5M  |                     } else if (isspace(*ptr) || (*ptr == '>')) { | 
1748  | 6.62M  |                         if (quoted == NOT_QUOTED) { | 
1749  | 20.3k  |                             state = HTML_RFC2397_FINISH;  | 
1750  | 20.3k  |                             ptr++;  | 
1751  | 6.60M  |                         } else { | 
1752  | 6.60M  |                             html_output_c(file_tmp_o1, *ptr);  | 
1753  | 6.60M  |                             ptr++;  | 
1754  | 6.60M  |                         }  | 
1755  | 42.9M  |                     } else { | 
1756  | 42.9M  |                         html_output_c(file_tmp_o1, *ptr);  | 
1757  | 42.9M  |                         ptr++;  | 
1758  | 42.9M  |                     }  | 
1759  | 57.3M  |                     if (*ptr == '\\') { | 
1760  | 62.3k  |                         escape = true;  | 
1761  | 57.3M  |                     } else { | 
1762  | 57.3M  |                         escape = false;  | 
1763  | 57.3M  |                     }  | 
1764  | 57.3M  |                     break;  | 
1765  | 96.8k  |                 case HTML_RFC2397_FINISH:  | 
1766  | 96.8k  |                     if (file_tmp_o1) { | 
1767  | 96.8k  |                         if (file_tmp_o1->fd != -1) { | 
1768  | 96.8k  |                             html_output_flush(file_tmp_o1);  | 
1769  | 96.8k  |                             close(file_tmp_o1->fd);  | 
1770  | 96.8k  |                             file_tmp_o1->fd = -1;  | 
1771  | 96.8k  |                         }  | 
1772  | 96.8k  |                         free(file_tmp_o1);  | 
1773  | 96.8k  |                         file_tmp_o1 = NULL;  | 
1774  | 96.8k  |                     }  | 
1775  | 96.8k  |                     state      = HTML_SKIP_WS;  | 
1776  | 96.8k  |                     escape     = false;  | 
1777  | 96.8k  |                     quoted     = NOT_QUOTED;  | 
1778  | 96.8k  |                     next_state = HTML_TAG_ARG;  | 
1779  | 96.8k  |                     binary     = false;  | 
1780  | 96.8k  |                     break;  | 
1781  | 414k  |                 case HTML_RFC2397_ESC:  | 
1782  | 414k  |                     if (length == 2) { | 
1783  | 52.5k  |                         html_output_c(file_tmp_o1, value);  | 
1784  | 361k  |                     } else if (length == 1) { | 
1785  | 361k  |                         html_output_c(file_tmp_o1, '%');  | 
1786  | 361k  |                         html_output_c(file_tmp_o1, value + '0');  | 
1787  | 361k  |                     } else { | 
1788  | 0  |                         html_output_c(file_tmp_o1, '%');  | 
1789  | 0  |                     }  | 
1790  | 414k  |                     state = HTML_RFC2397_DATA;  | 
1791  | 414k  |                     break;  | 
1792  | 466k  |                 case HTML_ESCAPE_CHAR:  | 
1793  | 466k  |                     if (value < INT64_MAX / 16) { | 
1794  | 466k  |                         value *= 16;  | 
1795  | 466k  |                     } else { | 
1796  | 0  |                         state      = next_state;  | 
1797  | 0  |                         next_state = HTML_BAD_STATE;  | 
1798  | 0  |                         ptr++;  | 
1799  | 0  |                         break;  | 
1800  | 0  |                     }  | 
1801  | 466k  |                     length++;  | 
1802  | 466k  |                     if (isxdigit(*ptr)) { | 
1803  | 76.6k  |                         if (isdigit(*ptr)) { | 
1804  | 63.2k  |                             value += (*ptr - '0');  | 
1805  | 63.2k  |                         } else { | 
1806  | 13.4k  |                             value += (tolower(*ptr) - 'a' + 10);  | 
1807  | 13.4k  |                         }  | 
1808  | 389k  |                     } else { | 
1809  | 389k  |                         state = next_state;  | 
1810  | 389k  |                     }  | 
1811  | 466k  |                     if (length == 2) { | 
1812  | 52.5k  |                         state = next_state;  | 
1813  | 52.5k  |                     }  | 
1814  | 466k  |                     ptr++;  | 
1815  | 466k  |                     break;  | 
1816  | 3.18G  |             }  | 
1817  | 3.18G  |         }  | 
1818  | 699k  |         if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin)  | 
1819  |  |             /* end of line, append contents now, resume on next line */  | 
1820  | 0  |             html_tag_contents_append(&contents, href_contents_begin, ptr);  | 
1821  | 699k  |         ptrend = NULL;  | 
1822  |  |  | 
1823  | 699k  |         if (js_state) { | 
1824  | 455k  |             js_process(js_state, js_begin, js_end, line, ptr, in_tag, dirname);  | 
1825  | 455k  |             js_begin = js_end = NULL;  | 
1826  | 455k  |             if (in_tag == TAG_DONT_EXTRACT) { | 
1827  | 0  |                 js_state = NULL;  | 
1828  | 0  |             }  | 
1829  | 455k  |         }  | 
1830  |  |  | 
1831  | 699k  |         if (in_tag == TAG_STYLE) { | 
1832  | 17.2k  |             if (ptr < style_begin) { | 
1833  | 210  |                 cli_dbgmsg("cli_html_normalise: style chunk size underflow\n"); | 
1834  | 210  |                 goto done;  | 
1835  | 210  |             }  | 
1836  |  |  | 
1837  | 17.0k  |             size_t chunk_size = ptr - style_begin;  | 
1838  |  |  | 
1839  | 17.0k  |             if (style_buff == NULL) { | 
1840  | 2.14k  |                 CLI_MAX_MALLOC_OR_GOTO_DONE(style_buff, chunk_size + 1);  | 
1841  | 14.9k  |             } else { | 
1842  | 14.9k  |                 CLI_MAX_REALLOC_OR_GOTO_DONE(style_buff, style_buff_size + chunk_size + 1);  | 
1843  | 14.9k  |             }  | 
1844  |  |  | 
1845  | 17.0k  |             memcpy(style_buff + style_buff_size, style_begin, chunk_size);  | 
1846  |  |  | 
1847  | 17.0k  |             style_buff_size += chunk_size;  | 
1848  | 17.0k  |             style_buff[style_buff_size] = '\0';  | 
1849  | 17.0k  |         }  | 
1850  |  |  | 
1851  | 699k  |         if (look_for_screnc && ptr_screnc) { | 
1852  |  |             /* start found, and stuff before it already processed */  | 
1853  | 243k  |             ptr        = ptr_screnc;  | 
1854  | 243k  |             ptr_screnc = NULL;  | 
1855  | 243k  |             state      = HTML_JSDECODE_LENGTH;  | 
1856  | 243k  |             next_state = HTML_BAD_STATE;  | 
1857  | 243k  |             continue;  | 
1858  | 243k  |         }  | 
1859  | 455k  |         free(line);  | 
1860  | 455k  |         ptr = line = cli_readchunk(stream_in, m_area, 8192);  | 
1861  |  |  | 
1862  | 455k  |         if (in_tag == TAG_STYLE) { | 
1863  |  |             // reset style_begin to start of the next line  | 
1864  | 14.8k  |             style_begin = line;  | 
1865  | 14.8k  |         }  | 
1866  |  |  | 
1867  | 455k  |         if (in_screnc) { | 
1868  | 56.1k  |             state      = HTML_JSDECODE_DECRYPT;  | 
1869  | 56.1k  |             next_state = HTML_BAD_STATE;  | 
1870  | 399k  |         } else if (look_for_screnc && !ptr_screnc &&  | 
1871  | 399k  |                    state != HTML_LOOKFOR_SCRENC) { | 
1872  | 127k  |             saved_next_state = next_state;  | 
1873  | 127k  |             next_state       = state;  | 
1874  | 127k  |             state            = HTML_LOOKFOR_SCRENC;  | 
1875  | 127k  |         }  | 
1876  |  |  | 
1877  | 455k  |         if (next_state == state) { | 
1878  |  |             /* safeguard against infloop */  | 
1879  | 27.6k  |             cli_dbgmsg("htmlnorm.c: next_state == state, changing next_state\n"); | 
1880  | 27.6k  |             next_state = HTML_BAD_STATE;  | 
1881  | 27.6k  |         }  | 
1882  | 455k  |     }  | 
1883  |  |  | 
1884  | 259k  |     if (style_buff != NULL) { | 
1885  |  |         // Found contents of <style> ... </style> block.  | 
1886  |  |         // Search it for images embedded in the CSS.  | 
1887  | 15.9k  |         cl_error_t ret = html_style_block_handler(ctx, (const char *)style_buff);  | 
1888  | 15.9k  |         if (CL_SUCCESS != ret) { | 
1889  | 0  |             cli_dbgmsg("Scan of image extracted from html <style> block returned: %s\n", cl_strerror(ret)); | 
1890  | 0  |             goto done;  | 
1891  | 0  |         }  | 
1892  |  |  | 
1893  | 15.9k  |         free(style_buff);  | 
1894  | 15.9k  |         style_buff = NULL;  | 
1895  | 15.9k  |     }  | 
1896  |  |  | 
1897  | 259k  |     if (dconf_entconv) { | 
1898  |  |         /* handle "unfinished" entities */  | 
1899  | 259k  |         size_t i;  | 
1900  | 259k  |         const char *normalized;  | 
1901  | 259k  |         entity_val[entity_val_length] = '\0';  | 
1902  | 259k  |         normalized                    = entity_norm(&conv, entity_val);  | 
1903  | 259k  |         if (normalized) { | 
1904  | 8.40k  |             for (i = 0; i < strlen(normalized); i++)  | 
1905  | 7.46k  |                 html_output_c(file_buff_o2, normalized[i] & 0xff);  | 
1906  | 258k  |         } else { | 
1907  | 258k  |             if (entity_val_length) { | 
1908  | 1.62k  |                 html_output_c(file_buff_o2, '&');  | 
1909  | 179k  |                 for (i = 0; i < entity_val_length; i++)  | 
1910  | 177k  |                     html_output_c(file_buff_o2, tolower(entity_val[i]));  | 
1911  | 1.62k  |             }  | 
1912  | 258k  |         }  | 
1913  | 259k  |     }  | 
1914  |  |  | 
1915  | 259k  |     retval = true;  | 
1916  |  |  | 
1917  | 259k  | done:  | 
1918  | 259k  |     if (line) /* only needed for done case */  | 
1919  | 626  |         free(line);  | 
1920  | 259k  |     if (in_form_action)  | 
1921  | 0  |         free(in_form_action);  | 
1922  | 259k  |     if (in_ahref) /* tag not closed, force closing */  | 
1923  | 0  |         html_tag_contents_done(hrefs, in_ahref, &contents);  | 
1924  |  |  | 
1925  | 259k  |     if (js_state) { | 
1926  |  |         /*  output script so far */  | 
1927  | 180k  |         cli_js_parse_done(js_state);  | 
1928  | 180k  |         cli_js_output(js_state, dirname);  | 
1929  | 180k  |         cli_js_destroy(js_state);  | 
1930  | 180k  |         js_state = NULL;  | 
1931  | 180k  |     }  | 
1932  | 259k  |     html_tag_arg_free(&tag_args);  | 
1933  | 259k  |     if (!m_area) { | 
1934  | 0  |         fclose(stream_in);  | 
1935  | 0  |     }  | 
1936  | 259k  |     if (file_buff_o2) { | 
1937  | 259k  |         html_output_flush(file_buff_o2);  | 
1938  | 259k  |         if (file_buff_o2->fd != -1)  | 
1939  | 259k  |             close(file_buff_o2->fd);  | 
1940  | 259k  |         free(file_buff_o2);  | 
1941  | 259k  |     }  | 
1942  | 259k  |     if (file_buff_text) { | 
1943  | 259k  |         html_output_flush(file_buff_text);  | 
1944  | 259k  |         if (file_buff_text->fd != -1)  | 
1945  | 259k  |             close(file_buff_text->fd);  | 
1946  | 259k  |         free(file_buff_text);  | 
1947  | 259k  |         file_buff_text = NULL;  | 
1948  | 259k  |     }  | 
1949  | 259k  |     if (file_tmp_o1) { | 
1950  | 5.33k  |         if (file_tmp_o1->fd != -1) { | 
1951  | 5.33k  |             html_output_flush(file_tmp_o1);  | 
1952  | 5.33k  |             close(file_tmp_o1->fd);  | 
1953  | 5.33k  |         }  | 
1954  | 5.33k  |         free(file_tmp_o1);  | 
1955  | 5.33k  |     }  | 
1956  | 259k  |     if (style_buff != NULL) { | 
1957  | 626  |         free(style_buff);  | 
1958  | 626  |     }  | 
1959  | 259k  |     return retval;  | 
1960  | 259k  | }  | 
1961  |  |  | 
1962  |  | bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)  | 
1963  | 0  | { | 
1964  | 0  |     m_area_t m_area;  | 
1965  |  | 
  | 
1966  | 0  |     m_area.buffer = in_buff;  | 
1967  | 0  |     m_area.length = in_size;  | 
1968  | 0  |     m_area.offset = 0;  | 
1969  | 0  |     m_area.map    = NULL;  | 
1970  |  | 
  | 
1971  | 0  |     return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);  | 
1972  | 0  | }  | 
1973  |  |  | 
1974  |  | bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)  | 
1975  | 259k  | { | 
1976  | 259k  |     bool retval = false;  | 
1977  | 259k  |     m_area_t m_area;  | 
1978  |  |  | 
1979  | 259k  |     m_area.length = map->len;  | 
1980  | 259k  |     m_area.offset = 0;  | 
1981  | 259k  |     m_area.map    = map;  | 
1982  | 259k  |     retval        = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);  | 
1983  | 259k  |     return retval;  | 
1984  | 259k  | }  | 
1985  |  |  | 
1986  |  | bool html_screnc_decode(fmap_t *map, const char *dirname)  | 
1987  | 360k  | { | 
1988  | 360k  |     int count;  | 
1989  | 360k  |     bool retval         = false;  | 
1990  | 360k  |     unsigned char *line = NULL, tmpstr[6];  | 
1991  | 360k  |     unsigned char *ptr, filename[1024];  | 
1992  | 360k  |     int ofd;  | 
1993  | 360k  |     struct screnc_state screnc_state;  | 
1994  | 360k  |     m_area_t m_area;  | 
1995  |  |  | 
1996  | 360k  |     memset(&m_area, 0, sizeof(m_area));  | 
1997  | 360k  |     m_area.length = map->len;  | 
1998  | 360k  |     m_area.offset = 0;  | 
1999  | 360k  |     m_area.map    = map;  | 
2000  |  |  | 
2001  | 360k  |     snprintf((char *)filename, 1024, "%s" PATHSEP "screnc.html", dirname);  | 
2002  | 360k  |     ofd = open((const char *)filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);  | 
2003  |  |  | 
2004  | 360k  |     if (ofd < 0) { | 
2005  | 0  |         cli_dbgmsg("open failed: %s\n", filename); | 
2006  | 0  |         return false;  | 
2007  | 0  |     }  | 
2008  |  |  | 
2009  | 360k  |     while ((line = cli_readchunk(NULL, &m_area, 8192)) != NULL) { | 
2010  | 360k  |         ptr = (unsigned char *)strstr((char *)line, "#@~^");  | 
2011  | 360k  |         if (ptr) { | 
2012  | 360k  |             break;  | 
2013  | 360k  |         }  | 
2014  | 0  |         free(line);  | 
2015  | 0  |         line = NULL;  | 
2016  | 0  |     }  | 
2017  | 360k  |     if (!line) { | 
2018  | 0  |         goto done;  | 
2019  | 0  |     }  | 
2020  |  |  | 
2021  |  |     /* Calculate the length of the encoded string */  | 
2022  | 360k  |     ptr += 4;  | 
2023  | 360k  |     count = 0;  | 
2024  | 2.71M  |     do { | 
2025  | 2.71M  |         if (!*ptr) { | 
2026  | 66.8k  |             free(line);  | 
2027  | 66.8k  |             ptr = line = cli_readchunk(NULL, &m_area, 8192);  | 
2028  | 66.8k  |             if (!line) { | 
2029  | 55.6k  |                 goto done;  | 
2030  | 55.6k  |             }  | 
2031  | 66.8k  |         }  | 
2032  | 2.66M  |         if (count < 6)  | 
2033  | 2.03M  |             tmpstr[count] = *ptr;  | 
2034  | 2.66M  |         count++;  | 
2035  | 2.66M  |         ptr++;  | 
2036  | 2.66M  |     } while (count < 8);  | 
2037  |  |  | 
2038  | 304k  |     memset(&screnc_state, 0, sizeof(screnc_state));  | 
2039  | 304k  |     screnc_state.length = base64_chars[tmpstr[0]] < 0 ? 0 : base64_chars[tmpstr[0]] << 2;  | 
2040  | 304k  |     screnc_state.length += base64_chars[tmpstr[1]] >> 4;  | 
2041  | 304k  |     screnc_state.length += (base64_chars[tmpstr[1]] & 0x0f) << 12;  | 
2042  | 304k  |     screnc_state.length += ((base64_chars[tmpstr[2]] >> 2) < 0 ? 0 : (base64_chars[tmpstr[2]] >> 2)) << 8;  | 
2043  | 304k  |     screnc_state.length += (base64_chars[tmpstr[2]] & 0x03) << 22;  | 
2044  | 304k  |     screnc_state.length += base64_chars[tmpstr[3]] < 0 ? 0 : base64_chars[tmpstr[3]] << 16;  | 
2045  | 304k  |     screnc_state.length += (base64_chars[tmpstr[4]] < 0 ? 0 : base64_chars[tmpstr[4]] << 2) << 24;  | 
2046  | 304k  |     screnc_state.length += ((base64_chars[tmpstr[5]] >> 4) < 0 ? 0 : (base64_chars[tmpstr[5]] >> 4)) << 24;  | 
2047  | 304k  |     cli_writen(ofd, "<script>", strlen("<script>")); | 
2048  | 725k  |     while (screnc_state.length && line) { | 
2049  | 420k  |         screnc_decode(ptr, &screnc_state);  | 
2050  | 420k  |         cli_writen(ofd, ptr, strlen((const char *)ptr));  | 
2051  | 420k  |         free(line);  | 
2052  | 420k  |         line = NULL;  | 
2053  | 420k  |         if (screnc_state.length) { | 
2054  | 392k  |             ptr = line = cli_readchunk(NULL, &m_area, 8192);  | 
2055  | 392k  |         }  | 
2056  | 420k  |     }  | 
2057  | 304k  |     cli_writen(ofd, "</script>", strlen("</script>")); | 
2058  | 304k  |     if (screnc_state.length)  | 
2059  | 265k  |         cli_dbgmsg("html_screnc_decode: missing %u bytes\n", screnc_state.length); | 
2060  | 304k  |     retval = true;  | 
2061  |  |  | 
2062  | 360k  | done:  | 
2063  | 360k  |     close(ofd);  | 
2064  | 360k  |     if (line) { | 
2065  | 10.9k  |         free(line);  | 
2066  | 10.9k  |     }  | 
2067  | 360k  |     return retval;  | 
2068  | 304k  | }  |