Coverage Report

Created: 2024-05-20 06:31

/src/clamav/libclamav/htmlnorm.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (C) 2013-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
3
 *  Copyright (C) 2007-2013 Sourcefire, Inc.
4
 *
5
 *  Authors: Trog
6
 *
7
 *  Summary: Normalise HTML text. Decode MS Script Encoder protection.
8
 *           The ScrEnc decoder was initially based upon an analysis by Andreas Marx.
9
 *
10
 *  This program is free software; you can redistribute it and/or modify
11
 *  it under the terms of the GNU General Public License version 2 as
12
 *  published by the Free Software Foundation.
13
 *
14
 *  This program is distributed in the hope that it will be useful,
15
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 *  GNU General Public License for more details.
18
 *
19
 *  You should have received a copy of the GNU General Public License
20
 *  along with this program; if not, write to the Free Software
21
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
22
 *  MA 02110-1301, USA.
23
 */
24
25
#if HAVE_CONFIG_H
26
#include "clamav-config.h"
27
#endif
28
29
#include <stdio.h>
30
#ifdef HAVE_UNISTD_H
31
#include <unistd.h>
32
#endif
33
#include <sys/types.h>
34
#include <sys/stat.h>
35
#include <fcntl.h>
36
#ifdef HAVE_STRINGS_H
37
#include <strings.h>
38
#endif
39
#include <string.h>
40
#include <errno.h>
41
#include <stdio.h>
42
#include <ctype.h>
43
44
#include "clamav.h"
45
#include "fmap.h"
46
#include "others.h"
47
#include "htmlnorm.h"
48
49
#include "entconv.h"
50
#include "jsparse/js-norm.h"
51
52
#include "clamav_rust.h"
53
#include "scanners.h"
54
55
328M
/* Capacity, excluding the terminating NUL, of the fixed-size tag, tag
 * argument, tag value and entity buffers (each is declared with + 1). */
#define HTML_STR_LENGTH 1024
56
0
/* Number of content bytes struct tag_contents can hold before its NUL. */
#define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
57
58
/* States of the HTML normaliser state machine. cli_html_normalise keeps a
 * current state, a next state and a saved next state of this type, starting
 * in HTML_NORM with the other two set to HTML_BAD_STATE. */
typedef enum {
59
    HTML_BAD_STATE,
60
    HTML_NORM,
61
    HTML_8BIT,
62
    HTML_COMMENT,
63
    HTML_CHAR_REF,
64
    HTML_ENTITY_REF_DECODE,
65
    HTML_SKIP_WS,
66
    HTML_TRIM_WS,
67
    HTML_TAG,
68
    HTML_TAG_ARG,
69
    HTML_TAG_ARG_VAL,
70
    HTML_TAG_ARG_EQUAL,
71
    HTML_PROCESS_TAG,
72
    HTML_CHAR_REF_DECODE,
73
    HTML_LOOKFOR_SCRENC,
74
    HTML_JSDECODE,
75
    HTML_JSDECODE_LENGTH,
76
    HTML_JSDECODE_DECRYPT,
77
    HTML_SPECIAL_CHAR,
78
    HTML_RFC2397_TYPE,
79
    HTML_RFC2397_INIT,
80
    HTML_RFC2397_DATA,
81
    HTML_RFC2397_FINISH,
82
    HTML_RFC2397_ESC,
83
    HTML_ESCAPE_CHAR
84
} html_state;
85
86
/* Kind of tag whose body is currently being processed. js_process treats
 * TAG_DONT_EXTRACT as "the script has ended" and finalises the JS parser. */
typedef enum {
87
    TAG_DONT_EXTRACT,
88
    TAG_SCRIPT,
89
    TAG_STYLE,
90
} tag_type;
91
92
/* Quoting style of the value currently being parsed; cli_html_normalise
 * starts out with NOT_QUOTED. */
typedef enum {
93
    SINGLE_QUOTED,
94
    DOUBLE_QUOTED,
95
    NOT_QUOTED
96
} quoted_state;
97
98
3.30G
/* Size in bytes of the write buffer embedded in file_buff_t. */
#define HTML_FILE_BUFF_LEN 8192
99
100
/* Buffered output file: bytes accumulate in buffer[] and are written to fd
 * by html_output_flush, which then resets length to 0. */
typedef struct file_buff_tag {
101
    int fd; /* descriptor passed to cli_writen */
102
    unsigned char buffer[HTML_FILE_BUFF_LEN]; /* pending output bytes */
103
    uint64_t length; /* number of valid bytes currently in buffer[] */
104
} file_buff_t;
105
106
/* Accumulator for the text collected by html_tag_contents_append and
 * copied out (NUL-terminated) by html_tag_contents_done. */
struct tag_contents {
107
    size_t pos; /* next write index into contents[] */
108
    unsigned char contents[MAX_TAG_CONTENTS_LENGTH + 1]; /* + 1 leaves room for the terminator */
109
};
110
111
// clang-format off
112
/* Maps a byte value to its base64 digit value: 'A'-'Z' -> 0-25,
 * 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, '/' -> 63.
 * Every other byte maps to -1. screnc_decode uses it to rebuild the
 * expected checksum that follows the encoded data. */
static const int64_t base64_chars[256] = {
113
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
114
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
115
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
116
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
117
    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
118
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
119
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
120
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
121
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
122
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
123
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
124
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
125
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
126
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
127
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
128
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
129
};
130
// clang-format on
131
132
/* 64-entry schedule selecting which of the three decrypt_tables to use for
 * each successive decoded byte. screnc_decode indexes it with
 * screnc_state.table_pos, which it advances modulo 64. The entries are
 * written as octal literals but are just the values 0, 1 and 2. */
int table_order[] = {
133
    00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
134
    00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
135
    00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
136
    00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02};
137
138
/* Three substitution tables for the Script Encoder decoder, indexed as
 * decrypt_tables[table][encoded byte], where the encoded byte is < 0x80.
 * The value 0xFF (at index 0x40 in every table) marks an escape sequence
 * whose second byte screnc_decode translates separately.
 * NOTE(review): the third table maps index 0x0D to 0x06 while the first two
 * map it to 0x0D. screnc_decode skips '\r' before the lookup, so the entry
 * is not reached from there -- confirm whether any other user depends on it. */
int decrypt_tables[3][128] = {
139
    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
140
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
141
     0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
142
     0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
143
     0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
144
     0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
145
     0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
146
     0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},
147
148
    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
149
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
150
     0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
151
     0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
152
     0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
153
     0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
154
     0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
155
     0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},
156
157
    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
158
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
159
     0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
160
     0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
161
     0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
162
     0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
163
     0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
164
     0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}};
165
166
/**
 * Find where to cut @chunk so that it ends right after a whitespace byte.
 *
 * Walks backwards from chunk[len - 1] until a whitespace byte is found or
 * only the first byte is left.
 *
 * @param chunk  Buffer to inspect; only the first @len bytes are read.
 * @param len    Number of valid bytes in @chunk.
 * @return The shortened length (index just past the whitespace byte found),
 *         or the original @len when the scan reaches the first byte, i.e.
 *         there is no whitespace beyond chunk[0]. Returns 0 for an empty
 *         chunk.
 */
static inline unsigned int rewind_tospace(const unsigned char *chunk, unsigned int len)
{
    const unsigned int count = len;

    /* Nothing to rewind; also keeps us from reading chunk[len - 1] with an
     * index that has wrapped around. */
    if (len == 0) {
        return 0;
    }

    /* Check the bound before touching the byte. */
    while ((len > 1) && !isspace(chunk[len - 1])) {
        len--;
    }
    if (len == 1) {
        return count;
    }
    return len;
}
177
178
/* read at most @max_len of data from @m_area or @stream, skipping NULL chars.
179
 * This used to be called cli_readline, but we don't stop at end-of-line anymore */
180
static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
181
1.53M
{
182
1.53M
    unsigned char *chunk, *start, *ptr, *end;
183
1.53M
    unsigned int chunk_len, count;
184
185
1.53M
    chunk = (unsigned char *)cli_max_malloc(max_len);
186
1.53M
    if (!chunk) {
187
0
        cli_errmsg("readchunk: Unable to allocate memory for chunk\n");
188
0
        return NULL;
189
0
    }
190
191
    /* Try to use the memory buffer first */
192
1.53M
    if (m_area) {
193
        /* maximum we can copy into the buffer,
194
         * we could have less than max_len bytes available */
195
1.53M
        chunk_len = MIN(m_area->length - m_area->offset, max_len - 1);
196
1.53M
        if (!chunk_len) {
197
580k
            free(chunk);
198
580k
            return NULL;
199
580k
        }
200
954k
        if (m_area->map)
201
954k
            ptr = (unsigned char *)fmap_need_off_once(m_area->map, m_area->offset, chunk_len);
202
0
        else
203
0
            ptr = m_area->buffer + m_area->offset;
204
954k
        start = ptr;
205
954k
        end   = ptr - m_area->offset + m_area->length;
206
207
954k
        if ((start >= end) || !start) {
208
0
            free(chunk);
209
0
            return NULL;
210
0
        }
211
212
        /* look for NULL chars */
213
954k
        ptr = memchr(start, 0, chunk_len);
214
954k
        if (!ptr) {
215
            /* no NULL chars found, copy all */
216
295k
            memcpy(chunk, start, chunk_len);
217
295k
            chunk[chunk_len] = '\0';
218
295k
            m_area->offset += chunk_len;
219
            /* point ptr to end of chunk,
220
             * so we can check and rewind to a space below */
221
295k
            ptr = start + chunk_len;
222
658k
        } else {
223
            /* copy portion that doesn't contain NULL chars */
224
658k
            chunk_len = ptr - start;
225
658k
            if (chunk_len < max_len) {
226
658k
                memcpy(chunk, start, chunk_len);
227
658k
            } else {
228
0
                chunk_len = 0;
229
0
                ptr       = start;
230
0
            }
231
658k
            if (m_area->map)
232
658k
                ptr = (unsigned char *)fmap_need_ptr_once(m_area->map, ptr, end - ptr);
233
658k
            if (!ptr) {
234
0
                cli_warnmsg("fmap inconsistency\n");
235
0
                ptr = end;
236
0
            }
237
            /* we have unknown number of NULL chars,
238
             * copy char-by-char and skip them */
239
2.78G
            while ((ptr < end) && (chunk_len < max_len - 1)) {
240
2.78G
                const unsigned char c = *ptr++;
241
                /* we can't use chunk_len to determine how many bytes we read, since
242
                 * we skipped chars */
243
2.78G
                if (c) {
244
2.29G
                    chunk[chunk_len++] = c;
245
2.29G
                }
246
2.78G
            }
247
658k
            m_area->offset += ptr - start;
248
658k
            chunk[chunk_len] = '\0';
249
658k
        }
250
954k
        if (ptr && ptr < end && !isspace(*ptr)) {
251
            /* we hit max_len, rewind to a space */
252
318k
            count = rewind_tospace(chunk, chunk_len);
253
318k
            if (count < chunk_len) {
254
250k
                chunk[count] = '\0';
255
250k
                m_area->offset -= chunk_len - count;
256
250k
            }
257
318k
        }
258
954k
    } else {
259
0
        if (!stream) {
260
0
            cli_dbgmsg("No HTML stream\n");
261
0
            free(chunk);
262
0
            return NULL;
263
0
        }
264
0
        chunk_len = fread(chunk, 1, max_len - 1, stream);
265
0
        if (!chunk_len || chunk_len > max_len - 1) {
266
            /* EOF, or prevent overflow */
267
0
            free(chunk);
268
0
            return NULL;
269
0
        }
270
271
        /* Look for NULL chars */
272
0
        ptr = memchr(chunk, 0, chunk_len);
273
0
        if (ptr) {
274
            /* NULL char found */
275
            /* save buffer limits */
276
0
            start = ptr;
277
0
            end   = chunk + chunk_len;
278
279
            /* start of NULL chars, we will copy non-NULL characters
280
             * to this position */
281
0
            chunk_len = ptr - chunk;
282
283
            /* find first non-NULL char */
284
0
            while ((ptr < end) && !(*ptr)) {
285
0
                ptr++;
286
0
            }
287
            /* skip over NULL chars, and move back the rest */
288
0
            while ((ptr < end) && (chunk_len < max_len - 1)) {
289
0
                const unsigned char c = *ptr++;
290
0
                if (c) {
291
0
                    chunk[chunk_len++] = c;
292
0
                }
293
0
            }
294
0
        }
295
0
        chunk[chunk_len] = '\0';
296
0
        if (chunk_len == max_len - 1) {
297
            /* rewind to a space (which includes newline) */
298
0
            count = rewind_tospace(chunk, chunk_len);
299
0
            if (count < chunk_len) {
300
0
                chunk[count] = '\0';
301
                /* seek-back to space */
302
0
                fseek(stream, -(long)(chunk_len - count), SEEK_CUR);
303
0
            }
304
0
        }
305
0
    }
306
307
954k
    return chunk;
308
1.53M
}
309
310
/**
 * Write any bytes pending in @fbuff to its file descriptor and mark the
 * buffer empty. Does nothing for a NULL or already empty buffer.
 */
static void html_output_flush(file_buff_t *fbuff)
{
    if (!fbuff || fbuff->length == 0) {
        return;
    }

    cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
    fbuff->length = 0;
}
317
318
/**
 * Append the single byte @c to @fbuff1, flushing first when the buffer is
 * exactly full. A NULL buffer is ignored.
 */
static inline void html_output_c(file_buff_t *fbuff1, unsigned char c)
{
    if (!fbuff1) {
        return;
    }

    if (fbuff1->length == HTML_FILE_BUFF_LEN) {
        html_output_flush(fbuff1);
    }
    fbuff1->buffer[fbuff1->length] = c;
    fbuff1->length++;
}
327
328
/**
 * Append @len bytes from @str to @fbuff.
 *
 * The buffer is flushed first when the new data would fill it. Data that is
 * at least as large as the whole buffer bypasses it and is written straight
 * to the file descriptor. A NULL buffer is ignored.
 */
static void html_output_str(file_buff_t *fbuff, const unsigned char *str, size_t len)
{
    if (!fbuff) {
        return;
    }

    /* Make room if the pending bytes plus the new ones would fill the buffer. */
    if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) {
        html_output_flush(fbuff);
    }

    if (len < HTML_FILE_BUFF_LEN) {
        memcpy(fbuff->buffer + fbuff->length, str, len);
        fbuff->length += len;
        return;
    }

    /* Too large to buffer: write it out directly. */
    html_output_flush(fbuff);
    cli_writen(fbuff->fd, str, len);
}
343
344
/**
 * Look up the value stored for the argument named @tag.
 *
 * @return The value of the first argument whose name equals @tag (this may
 *         itself be NULL when the argument has no value), or NULL when no
 *         argument has that name. The string stays owned by @tags.
 */
static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)
{
    int idx = 0;

    while (idx < tags->count) {
        if (0 == strcmp((const char *)tags->tag[idx], tag)) {
            return (char *)tags->value[idx];
        }
        idx++;
    }

    return NULL;
}
355
356
/**
 * Replace the value of the first argument named @tag with a copy of @value.
 * The previous value is freed. Nothing happens when no argument has that
 * name.
 */
static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)
{
    int idx;

    for (idx = 0; idx < tags->count; idx++) {
        if (strcmp((const char *)tags->tag[idx], tag) != 0) {
            continue;
        }

        free(tags->value[idx]);
        tags->value[idx] = (unsigned char *)cli_safer_strdup(value);
        break;
    }
}
369
/**
 * Append a new argument (name plus optional value) to @tags.
 *
 * The name and value are copied. A value that starts with a double quote is
 * stored without that quote and with its last character removed. When
 * tags->scanContents is set, a NULL contents slot is added for the new
 * argument as well.
 *
 * Failure handling:
 *  - if growing one of the arrays fails, every stored argument is released
 *    and @tags is reset to empty (full recovery is not possible);
 *  - if the name cannot be copied, only the new argument is dropped;
 *  - if the value cannot be copied, the argument is stored without a value.
 *
 * @param tags   Argument list to extend.
 * @param tag    Name of the argument.
 * @param value  Value of the argument, or NULL for none.
 */
void html_tag_arg_add(tag_arguments_t *tags,
                      const char *tag, char *value)
{
    int i;
    unsigned char *tag_copy;
    unsigned char *value_copy = NULL;

    tags->count++;
    tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,
                                                          tags->count * sizeof(char *));
    if (!tags->tag) {
        goto done;
    }
    tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,
                                                            tags->count * sizeof(char *));
    if (!tags->value) {
        goto done;
    }
    if (tags->scanContents) {
        tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,
                                                                   tags->count * sizeof(*tags->contents));
        if (!tags->contents) {
            goto done;
        }
        tags->contents[tags->count - 1] = NULL;
    }

    /* presumably cli_safer_strdup returns NULL when it cannot allocate --
     * confirm against its declaration. A NULL name must never be stored:
     * the lookup and output helpers pass names to strcmp()/strlen(). */
    tag_copy = (unsigned char *)cli_safer_strdup(tag);
    if (!tag_copy) {
        /* The arrays merely stay one slot larger than needed. */
        tags->count--;
        return;
    }

    if (value) {
        if (*value == '"') {
            value_copy = (unsigned char *)cli_safer_strdup(value + 1);
            if (value_copy) {
                /* Drop the final character (the closing quote). */
                size_t len = strlen((const char *)value_copy);
                if (len > 0) {
                    value_copy[len - 1] = '\0';
                }
            }
        } else {
            value_copy = (unsigned char *)cli_safer_strdup(value);
        }
    }

    tags->tag[tags->count - 1]   = tag_copy;
    tags->value[tags->count - 1] = value_copy;
    return;

done:
    /* Bad error - can't do 100% recovery */
    tags->count--;
    for (i = 0; i < tags->count; i++) {
        if (tags->tag) {
            free(tags->tag[i]);
        }
        if (tags->value) {
            free(tags->value[i]);
        }
        if (tags->contents) {
            if (tags->contents[i])
                free(tags->contents[i]);
        }
    }
    if (tags->tag) {
        free(tags->tag);
    }
    if (tags->value) {
        free(tags->value);
    }
    if (tags->contents)
        free(tags->contents);
    tags->contents = NULL;
    tags->tag = tags->value = NULL;
    tags->count             = 0;
    return;
}
436
437
/**
 * Write the tag @tag with all arguments in @tags to @fbuff, in the form
 * <tag name="value" name ...>. Argument values are lower-cased and wrapped
 * in double quotes; arguments without a value are written as a bare name.
 */
static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
{
    int arg;

    html_output_c(fbuff, '<');
    html_output_str(fbuff, (const unsigned char *)tag, strlen(tag));

    for (arg = 0; arg < tags->count; arg++) {
        int pos, value_len;

        html_output_c(fbuff, ' ');
        html_output_str(fbuff, tags->tag[arg], strlen((const char *)tags->tag[arg]));
        if (!tags->value[arg]) {
            continue;
        }

        html_output_str(fbuff, (const unsigned char *)"=\"", 2);
        value_len = strlen((const char *)tags->value[arg]);
        for (pos = 0; pos < value_len; pos++) {
            html_output_c(fbuff, tolower(tags->value[arg][pos]));
        }
        html_output_c(fbuff, '"');
    }

    html_output_c(fbuff, '>');
}
457
458
/**
 * Release everything stored in @tags -- names, values, contents and the
 * arrays holding them -- and reset it to an empty list. The tag_arguments_t
 * object itself is not freed.
 */
void html_tag_arg_free(tag_arguments_t *tags)
{
    int idx;

    for (idx = 0; idx < tags->count; idx++) {
        free(tags->tag[idx]);
        free(tags->value[idx]); /* free(NULL) is a no-op */
        if (tags->contents) {
            free(tags->contents[idx]);
        }
    }

    free(tags->tag);
    free(tags->value);
    free(tags->contents);

    tags->contents = NULL;
    tags->value    = NULL;
    tags->tag      = NULL;
    tags->count    = 0;
}
483
484
/**
485
 * the displayed text for an <a href> tag
486
 */
487
static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char *begin, const unsigned char *end)
488
0
{
489
0
    size_t i;
490
0
    uint32_t mbchar = 0;
491
0
    if (!begin || !end)
492
0
        return;
493
0
    for (i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) {
494
0
        uint8_t c = *begin++;
495
0
        if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
496
0
            if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||
497
0
                (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F))) {
498
0
                cont->contents[i++] = '.';
499
0
                if (mbchar == 0xA1) {
500
0
                    --i;
501
0
                    mbchar = 0;
502
0
                    continue;
503
0
                }
504
0
            } else {
505
0
                uint8_t c0 = mbchar >> 16;
506
0
                uint8_t c1 = (mbchar >> 8) & 0xff;
507
0
                uint8_t c2 = (mbchar & 0xff);
508
0
                if (c0 && i + 1 < MAX_TAG_CONTENTS_LENGTH)
509
0
                    cont->contents[i++] = c0;
510
0
                if ((c0 || c1) && i + 1 < MAX_TAG_CONTENTS_LENGTH)
511
0
                    cont->contents[i++] = c1;
512
0
                if (i + 1 < MAX_TAG_CONTENTS_LENGTH)
513
0
                    cont->contents[i++] = c2;
514
0
            }
515
0
            mbchar = 0;
516
0
        }
517
0
        if (c >= 0x80) {
518
0
            mbchar = (mbchar << 8) | c;
519
0
            --i;
520
0
        } else
521
0
            cont->contents[i] = c;
522
0
    }
523
0
    cont->pos = i;
524
0
}
525
526
/**
 * Finish collecting the contents of a tag: NUL-terminate the text gathered
 * in @cont, store a heap copy in tags->contents[idx - 1] and reset @cont for
 * the next tag.
 *
 * @param tags  Argument list whose contents array receives the copy.
 * @param idx   1-based index of the tag the text belongs to.
 * @param cont  Accumulator holding the collected text; always emptied, even
 *              when the copy could not be allocated.
 */
static inline void html_tag_contents_done(tag_arguments_t *tags, int idx, struct tag_contents *cont)
{
    unsigned char *p;

    /* Keep the terminator inside contents[], whatever position we were
     * handed. */
    if (cont->pos > MAX_TAG_CONTENTS_LENGTH) {
        cont->pos = MAX_TAG_CONTENTS_LENGTH;
    }
    cont->contents[cont->pos++] = '\0';
    p                           = cli_max_malloc(cont->pos);
    if (!p) {
        cli_errmsg("html_tag_contents_done: Unable to allocate memory for p\n");
        /* Drop the text, so it is not prepended to (and mistaken for) the
         * contents of the next tag. */
        cont->pos = 0;
        return;
    }
    memcpy(p, cont->contents, cont->pos);
    tags->contents[idx - 1] = p;
    cont->pos               = 0;
}
539
540
/* Decoder state that screnc_decode reads and updates on every call. */
struct screnc_state {
541
    uint32_t length; /* count of encoded characters still to decode; decremented by screnc_decode */
542
    uint32_t sum; /* running sum of the decoded byte values, compared with the expected checksum */
543
    uint8_t table_pos; /* current index into table_order; advanced modulo 64 */
544
};
545
546
/* inplace decoding, so that we can normalize it later */
547
static void screnc_decode(unsigned char *ptr, struct screnc_state *s)
548
688k
{
549
688k
    uint8_t value;
550
688k
    unsigned char *dst = ptr;
551
552
688k
    if (!ptr || !s)
553
0
        return;
554
1.11G
    while (s->length > 0 && *ptr) {
555
1.11G
        if ((*ptr == '\n') || (*ptr == '\r')) {
556
10.2M
            ptr++;
557
10.2M
            continue;
558
10.2M
        }
559
1.10G
        if (*ptr < 0x80) {
560
736M
            value = decrypt_tables[table_order[s->table_pos]][*ptr];
561
736M
            if (value == 0xFF) { /* special character */
562
6.71M
                ptr++;
563
6.71M
                s->length--;
564
6.71M
                switch (*ptr) {
565
8.63k
                    case '\0':
566
                        /* Fixup for end of line */
567
8.63k
                        ptr--;
568
8.63k
                        break;
569
61.0k
                    case 0x21:
570
61.0k
                        value = 0x3c;
571
61.0k
                        break;
572
165k
                    case 0x23:
573
165k
                        value = 0x0d;
574
165k
                        break;
575
61.0k
                    case 0x24:
576
61.0k
                        value = 0x40;
577
61.0k
                        break;
578
45.4k
                    case 0x26:
579
45.4k
                        value = 0x0a;
580
45.4k
                        break;
581
40.1k
                    case 0x2a:
582
40.1k
                        value = 0x3e;
583
40.1k
                        break;
584
6.71M
                }
585
6.71M
            }
586
736M
            s->sum += value;
587
736M
            *dst++       = value;
588
736M
            s->table_pos = (s->table_pos + 1) % 64;
589
736M
        } else {
590
365M
            *dst++ = *ptr++;
591
365M
            *dst++ = *ptr;
592
365M
            if (!*ptr) {
593
81.3k
                dst--;
594
81.3k
                break;
595
81.3k
            }
596
365M
        }
597
1.10G
        ptr++;
598
1.10G
        s->length--;
599
1.10G
    }
600
688k
    if (!s->length) {
601
238k
        size_t remaining;
602
238k
        if (strlen((const char *)ptr) >= 12) {
603
232k
            uint64_t expected;
604
232k
            expected = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;
605
232k
            expected += base64_chars[ptr[1]] >> 4;
606
232k
            expected += (base64_chars[ptr[1]] & 0x0f) << 12;
607
232k
            expected += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;
608
232k
            expected += (base64_chars[ptr[2]] & 0x03) << 22;
609
232k
            expected += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;
610
232k
            expected += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;
611
232k
            expected += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;
612
232k
            ptr += 8;
613
232k
            if (s->sum != expected) {
614
226k
                cli_dbgmsg("screnc_decode: checksum mismatch: %u != %" PRIu64 "\n", s->sum, expected);
615
226k
            } else {
616
5.73k
                if (strncmp((const char *)ptr, "^#~@", 4) != 0) {
617
4.59k
                    cli_dbgmsg("screnc_decode: terminator not found\n");
618
4.59k
                } else {
619
1.14k
                    cli_dbgmsg("screnc_decode: OK\n");
620
1.14k
                }
621
5.73k
            }
622
232k
            ptr += 4;
623
232k
        }
624
        /* copy remaining */
625
238k
        remaining = strlen((const char *)ptr) + 1;
626
238k
        memmove(dst, ptr, remaining);
627
450k
    } else {
628
450k
        *dst = '\0';
629
450k
    }
630
688k
}
631
632
/**
 * Hand a piece of script text to the JavaScript normaliser and, once the
 * script has ended, finish it off.
 *
 * The text runs from @js_begin (or @line when NULL) to @js_end (or @ptr when
 * NULL). It is only processed when it is non-empty and both ends lie within
 * the 8192 bytes starting at @line. When @in_tag is TAG_DONT_EXTRACT, i.e. a
 * closing /script was found, the parser is finalised, its output is written
 * using @dirname and the parser state is destroyed.
 */
static void js_process(struct parser_state *js_state, const unsigned char *js_begin, const unsigned char *js_end,
                       const unsigned char *line, const unsigned char *ptr, tag_type in_tag, const char *dirname)
{
    const unsigned char *from = js_begin ? js_begin : line;
    const unsigned char *to   = js_end ? js_end : ptr;

    if (to > from &&
        CLI_ISCONTAINED(line, 8192, from, 1) &&
        CLI_ISCONTAINED(line, 8192, to, 1)) {
        cli_js_process_buffer(js_state, (const char *)from, to - from);
    }

    if (in_tag != TAG_DONT_EXTRACT) {
        return;
    }

    /*  we found a /script, normalize script now */
    cli_js_parse_done(js_state);
    cli_js_output(js_state, dirname);
    cli_js_destroy(js_state);
}
651
652
static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
653
259k
{
654
259k
    int fd_tmp, tag_length = 0, tag_arg_length = 0;
655
259k
    bool binary, retval = false, escape = false, hex = false;
656
259k
    int64_t value = 0, tag_val_length = 0;
657
259k
    bool look_for_screnc = false, in_screnc = false, text_space_written = false;
658
259k
    tag_type in_tag  = TAG_DONT_EXTRACT;
659
259k
    FILE *stream_in  = NULL;
660
259k
    html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
661
259k
    char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
662
259k
    char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
663
259k
    unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
664
259k
    tag_arguments_t tag_args;
665
259k
    quoted_state quoted  = NOT_QUOTED;
666
259k
    unsigned long length = 0;
667
259k
    struct screnc_state screnc_state;
668
259k
    file_buff_t *file_buff_o2, *file_buff_text;
669
259k
    file_buff_t *file_tmp_o1           = NULL;
670
259k
    int in_ahref                       = 0;    /* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
671
259k
    unsigned char *href_contents_begin = NULL; /*beginning of the next portion of <a> contents*/
672
259k
    unsigned char *ptrend              = NULL; /*end of <a> contents*/
673
259k
    unsigned char *in_form_action      = NULL; /* the action URL of the current <form> tag, if any*/
674
675
259k
    struct entity_conv conv;
676
259k
    unsigned char entity_val[HTML_STR_LENGTH + 1];
677
259k
    size_t entity_val_length = 0;
678
259k
    const int dconf_entconv  = dconf ? dconf->phishing & PHISHING_CONF_ENTCONV : 1;
679
259k
    const int dconf_js       = dirname && (dconf ? dconf->doc & DOC_CONF_JSNORM : 1); /* TODO */
680
    /* dconf for phishing engine sets scanContents, so no need for a flag here */
681
259k
    struct parser_state *js_state = NULL;
682
259k
    const unsigned char *js_begin = NULL, *js_end = NULL;
683
259k
    uint8_t *style_buff              = NULL;
684
259k
    size_t style_buff_size           = 0;
685
259k
    const unsigned char *style_begin = NULL, *style_end = NULL;
686
259k
    struct tag_contents contents;
687
259k
    uint32_t mbchar  = 0;
688
259k
    uint32_t mbchar2 = 0;
689
690
    /*
691
     * Initialize stack buffers.
692
     */
693
259k
    memset(filename, 0, sizeof(filename));
694
259k
    memset(tag, 0, sizeof(tag));
695
259k
    memset(tag_arg, 0, sizeof(tag_arg));
696
259k
    memset(tag_val, 0, sizeof(tag_val));
697
259k
    memset(entity_val, 0, sizeof(entity_val));
698
699
259k
    tag_args.scanContents = 0; /* do we need to store the contents of <a></a>?*/
700
259k
    contents.pos          = 0;
701
259k
    if (!m_area) {
702
0
        if (fd < 0) {
703
0
            cli_dbgmsg("Invalid HTML fd\n");
704
0
            return false;
705
0
        }
706
0
        lseek(fd, 0, SEEK_SET);
707
0
        fd_tmp = dup(fd);
708
0
        if (fd_tmp < 0) {
709
0
            return false;
710
0
        }
711
0
        stream_in = fdopen(fd_tmp, "r");
712
0
        if (!stream_in) {
713
0
            close(fd_tmp);
714
0
            return false;
715
0
        }
716
0
    }
717
718
259k
    tag_args.count    = 0;
719
259k
    tag_args.tag      = NULL;
720
259k
    tag_args.value    = NULL;
721
259k
    tag_args.contents = NULL;
722
259k
    if (dirname) {
723
259k
        file_buff_o2 = (file_buff_t *)malloc(sizeof(file_buff_t));
724
259k
        if (!file_buff_o2) {
725
0
            cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_o2\n");
726
0
            file_buff_o2 = file_buff_text = NULL;
727
0
            goto done;
728
0
        }
729
730
        /* this will still contain scripts that are inside comments */
731
259k
        snprintf(filename, 1024, "%s" PATHSEP "nocomment.html", dirname);
732
259k
        file_buff_o2->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);
733
259k
        if (file_buff_o2->fd == -1) {
734
0
            cli_dbgmsg("open failed: %s\n", filename);
735
0
            free(file_buff_o2);
736
0
            file_buff_o2 = file_buff_text = NULL;
737
0
            goto done;
738
0
        }
739
740
259k
        file_buff_text = (file_buff_t *)malloc(sizeof(file_buff_t));
741
259k
        if (!file_buff_text) {
742
0
            close(file_buff_o2->fd);
743
0
            free(file_buff_o2);
744
0
            file_buff_o2 = file_buff_text = NULL;
745
0
            cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_text\n");
746
0
            goto done;
747
0
        }
748
749
259k
        snprintf(filename, 1024, "%s" PATHSEP "notags.html", dirname);
750
259k
        file_buff_text->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);
751
259k
        if (file_buff_text->fd == -1) {
752
0
            cli_dbgmsg("open failed: %s\n", filename);
753
0
            close(file_buff_o2->fd);
754
0
            free(file_buff_o2);
755
0
            free(file_buff_text);
756
0
            file_buff_o2 = file_buff_text = NULL;
757
0
            goto done;
758
0
        }
759
259k
        file_buff_o2->length   = 0;
760
259k
        file_buff_text->length = 0;
761
259k
    } else {
762
0
        file_buff_o2   = NULL;
763
0
        file_buff_text = NULL;
764
0
    }
765
766
259k
    binary = false;
767
768
259k
    ptr = line = cli_readchunk(stream_in, m_area, 8192);
769
770
959k
    while (line) {
771
700k
        if (href_contents_begin)
772
0
            href_contents_begin = ptr; /*start of a new line, last line already appended to contents see below*/
773
1.03M
        while (*ptr && isspace(*ptr)) {
774
337k
            ptr++;
775
337k
        }
776
3.20G
        while (*ptr) {
777
3.20G
            if (!binary && *ptr == '\n') {
778
                /* Convert it to a space and re-process */
779
14.7M
                *ptr = ' ';
780
14.7M
                continue;
781
14.7M
            }
782
3.19G
            if (!binary && *ptr == '\r') {
783
5.27M
                ptr++;
784
5.27M
                continue;
785
5.27M
            }
786
3.18G
            switch (state) {
787
0
                case HTML_SPECIAL_CHAR:
788
0
                    cli_dbgmsg("Impossible, special_char can't occur here\n");
789
0
                    break;
790
0
                case HTML_BAD_STATE:
791
                    /* An engine error has occurred */
792
0
                    cli_dbgmsg("HTML Engine Error\n");
793
0
                    goto done;
794
81.7M
                case HTML_SKIP_WS:
795
81.7M
                    if (isspace(*ptr)) {
796
15.0M
                        ptr++;
797
66.6M
                    } else {
798
66.6M
                        state      = next_state;
799
66.6M
                        next_state = HTML_BAD_STATE;
800
66.6M
                    }
801
81.7M
                    break;
802
156M
                case HTML_TRIM_WS:
803
156M
                    if (isspace(*ptr)) {
804
93.7M
                        ptr++;
805
93.7M
                    } else {
806
62.6M
                        if (in_tag == TAG_DONT_EXTRACT) {
807
9.52M
                            html_output_c(file_buff_o2, ' ');
808
9.52M
                        }
809
62.6M
                        state      = next_state;
810
62.6M
                        next_state = HTML_BAD_STATE;
811
62.6M
                    }
812
156M
                    break;
813
420M
                case HTML_8BIT:
814
420M
                    if (*ptr < 0x80 || mbchar >= 0x10000) {
815
190M
                        if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||
816
190M
                            (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
817
                            /* bb #4097 */
818
481k
                            html_output_c(file_buff_o2, '.');
819
481k
                            html_output_c(file_buff_text, '.');
820
481k
                            if (mbchar == 0xA1) {
821
330k
                                ptr++;
822
330k
                                mbchar = 0;
823
330k
                                continue;
824
330k
                            }
825
189M
                        } else {
826
189M
                            uint8_t c0 = mbchar >> 16;
827
189M
                            uint8_t c1 = (mbchar >> 8) & 0xff;
828
189M
                            uint8_t c2 = (mbchar & 0xff);
829
189M
                            if (c0) {
830
101M
                                html_output_c(file_buff_o2, c0);
831
101M
                                html_output_c(file_buff_text, c0);
832
101M
                            }
833
189M
                            if (c0 || c1) {
834
127M
                                html_output_c(file_buff_o2, c1);
835
127M
                                html_output_c(file_buff_text, c1);
836
127M
                            }
837
189M
                            html_output_c(file_buff_o2, c2);
838
189M
                            html_output_c(file_buff_text, c1);
839
189M
                        }
840
190M
                        mbchar     = 0;
841
190M
                        state      = next_state;
842
190M
                        next_state = HTML_NORM;
843
230M
                    } else {
844
230M
                        mbchar = (mbchar << 8) | *ptr;
845
230M
                        ptr++;
846
230M
                    }
847
420M
                    break;
848
1.99G
                case HTML_NORM:
849
1.99G
                    if (*ptr == '<') {
850
37.0M
                        ptrend = ptr; /* for use by scanContents */
851
37.0M
                        html_output_c(file_buff_o2, '<');
852
37.0M
                        if (in_tag == TAG_DONT_EXTRACT && !text_space_written) {
853
3.61M
                            html_output_c(file_buff_text, ' ');
854
3.61M
                            text_space_written = true;
855
3.61M
                        }
856
37.0M
                        if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
857
                            /*append this text portion to the contents of <a>*/
858
0
                            html_tag_contents_append(&contents, href_contents_begin, ptr);
859
0
                            href_contents_begin = NULL; /*We just encountered another tag inside <a>, so skip it*/
860
0
                        }
861
37.0M
                        ptr++;
862
37.0M
                        state      = HTML_SKIP_WS;
863
37.0M
                        tag_length = 0;
864
37.0M
                        next_state = HTML_TAG;
865
1.96G
                    } else if (isspace(*ptr)) {
866
62.7M
                        if (!text_space_written && in_tag == TAG_DONT_EXTRACT) {
867
8.42M
                            html_output_c(file_buff_text, ' ');
868
8.42M
                            text_space_written = true;
869
8.42M
                        }
870
62.7M
                        state      = HTML_TRIM_WS;
871
62.7M
                        next_state = HTML_NORM;
872
1.89G
                    } else if (*ptr == '&') {
873
9.78M
                        if (!text_space_written && in_tag == TAG_DONT_EXTRACT) {
874
773k
                            html_output_c(file_buff_text, ' ');
875
773k
                            text_space_written = true;
876
773k
                        }
877
9.78M
                        state      = HTML_CHAR_REF;
878
9.78M
                        next_state = HTML_NORM;
879
9.78M
                        ptr++;
880
1.89G
                    } else if (*ptr >= 0x80) {
881
190M
                        state      = HTML_8BIT;
882
190M
                        next_state = HTML_NORM;
883
190M
                        mbchar     = *ptr;
884
190M
                        ptr++;
885
1.69G
                    } else {
886
1.69G
                        unsigned char c = tolower(*ptr);
887
                        /* normalize ' to " for scripts */
888
1.69G
                        if (in_tag != TAG_DONT_EXTRACT && c == '\'') c = '"';
889
1.69G
                        html_output_c(file_buff_o2, c);
890
1.69G
                        if (in_tag == TAG_DONT_EXTRACT) {
891
238M
                            if (*ptr < 0x20) {
892
23.2M
                                if (!text_space_written) {
893
11.0M
                                    html_output_c(file_buff_text, ' ');
894
11.0M
                                    text_space_written = true;
895
11.0M
                                }
896
215M
                            } else {
897
215M
                                html_output_c(file_buff_text, c);
898
215M
                                text_space_written = false;
899
215M
                            }
900
238M
                        }
901
1.69G
                        ptr++;
902
1.69G
                    }
903
1.99G
                    break;
904
152M
                case HTML_TAG:
905
152M
                    if ((tag_length == 0) && (*ptr == '!')) {
906
                        /* Comment */
907
526k
                        if (in_tag != TAG_DONT_EXTRACT) {
908
                            /* we still write scripts to nocomment.html */
909
474k
                            html_output_c(file_buff_o2, '!');
910
474k
                        } else {
911
                            /* Need to rewind in the no-comment output stream */
912
52.3k
                            if (file_buff_o2 && (file_buff_o2->length > 0)) {
913
52.3k
                                file_buff_o2->length--;
914
52.3k
                            }
915
52.3k
                        }
916
526k
                        state      = HTML_COMMENT;
917
526k
                        next_state = HTML_BAD_STATE;
918
526k
                        ptr++;
919
152M
                    } else if (*ptr == '>') {
920
7.06M
                        html_output_c(file_buff_o2, '>');
921
7.06M
                        ptr++;
922
7.06M
                        tag[tag_length] = '\0';
923
7.06M
                        state           = HTML_SKIP_WS;
924
7.06M
                        next_state      = HTML_PROCESS_TAG;
925
144M
                    } else if (!isspace(*ptr)) {
926
142M
                        html_output_c(file_buff_o2, tolower(*ptr));
927
                        /* if we're inside a script we only care for </script>.*/
928
142M
                        if (in_tag != TAG_DONT_EXTRACT && tag_length == 0 && *ptr != '/') {
929
26.7M
                            state = HTML_NORM;
930
26.7M
                        }
931
142M
                        if (tag_length < HTML_STR_LENGTH) {
932
139M
                            tag[tag_length++] = tolower(*ptr);
933
139M
                        }
934
142M
                        ptr++;
935
142M
                    } else {
936
2.69M
                        tag[tag_length] = '\0';
937
2.69M
                        state           = HTML_SKIP_WS;
938
2.69M
                        tag_arg_length  = 0;
939
                        /* if we'd go to HTML_TAG_ARG whitespace would be inconsistently normalized for in_tag*/
940
2.69M
                        next_state = in_tag == TAG_DONT_EXTRACT ? HTML_TAG_ARG : HTML_PROCESS_TAG;
941
2.69M
                    }
942
152M
                    break;
943
119M
                case HTML_TAG_ARG:
944
119M
                    if (*ptr == '=') {
945
2.39M
                        html_output_c(file_buff_o2, '=');
946
2.39M
                        tag_arg[tag_arg_length] = '\0';
947
2.39M
                        ptr++;
948
2.39M
                        state          = HTML_SKIP_WS;
949
2.39M
                        escape         = false;
950
2.39M
                        quoted         = NOT_QUOTED;
951
2.39M
                        tag_val_length = 0;
952
2.39M
                        next_state     = HTML_TAG_ARG_VAL;
953
117M
                    } else if (isspace(*ptr)) {
954
4.35M
                        ptr++;
955
4.35M
                        tag_arg[tag_arg_length] = '\0';
956
4.35M
                        state                   = HTML_SKIP_WS;
957
4.35M
                        next_state              = HTML_TAG_ARG_EQUAL;
958
113M
                    } else if (*ptr == '>') {
959
2.50M
                        html_output_c(file_buff_o2, '>');
960
2.50M
                        if (tag_arg_length > 0) {
961
1.24M
                            tag_arg[tag_arg_length] = '\0';
962
1.24M
                            html_tag_arg_add(&tag_args, tag_arg, NULL);
963
1.24M
                        }
964
2.50M
                        ptr++;
965
2.50M
                        state      = HTML_PROCESS_TAG;
966
2.50M
                        next_state = HTML_BAD_STATE;
967
110M
                    } else {
968
110M
                        if (tag_arg_length == 0) {
969
                            /* Start of new tag - add space */
970
7.82M
                            html_output_c(file_buff_o2, ' ');
971
7.82M
                        }
972
110M
                        html_output_c(file_buff_o2, tolower(*ptr));
973
110M
                        if (tag_arg_length < HTML_STR_LENGTH) {
974
107M
                            tag_arg[tag_arg_length++] = tolower(*ptr);
975
107M
                        }
976
110M
                        ptr++;
977
110M
                    }
978
119M
                    break;
979
4.34M
                case HTML_TAG_ARG_EQUAL:
980
4.34M
                    if (*ptr == '=') {
981
151k
                        html_output_c(file_buff_o2, '=');
982
151k
                        ptr++;
983
151k
                        state          = HTML_SKIP_WS;
984
151k
                        escape         = false;
985
151k
                        quoted         = NOT_QUOTED;
986
151k
                        tag_val_length = 0;
987
151k
                        next_state     = HTML_TAG_ARG_VAL;
988
4.19M
                    } else {
989
4.19M
                        if (tag_arg_length > 0) {
990
4.19M
                            tag_arg[tag_arg_length] = '\0';
991
4.19M
                            html_tag_arg_add(&tag_args, tag_arg, NULL);
992
4.19M
                        }
993
4.19M
                        tag_arg_length = 0;
994
4.19M
                        state          = HTML_TAG_ARG;
995
4.19M
                        next_state     = HTML_BAD_STATE;
996
4.19M
                    }
997
4.34M
                    break;
998
59.0M
                case HTML_TAG_ARG_VAL:
999
59.0M
                    if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) {
1000
                        /* RFC2397 inline data */
1001
1002
                        /* Rewind one byte so we don't recurse */
1003
21.6k
                        if (file_buff_o2 && (file_buff_o2->length > 0)) {
1004
21.6k
                            file_buff_o2->length--;
1005
21.6k
                        }
1006
1007
21.6k
                        if (quoted != NOT_QUOTED) {
1008
0
                            html_output_c(file_buff_o2, '"');
1009
0
                        }
1010
21.6k
                        tag_val_length = 0;
1011
21.6k
                        state          = HTML_RFC2397_TYPE;
1012
21.6k
                        next_state     = HTML_TAG_ARG;
1013
59.0M
                    } else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) {
1014
                        /* RFC2397 inline data */
1015
1016
                        /* Rewind one byte so we don't recurse */
1017
117k
                        if (file_buff_o2 && (file_buff_o2->length > 0)) {
1018
117k
                            file_buff_o2->length--;
1019
117k
                        }
1020
1021
117k
                        if (quoted != NOT_QUOTED) {
1022
117k
                            html_output_c(file_buff_o2, '"');
1023
117k
                        }
1024
1025
117k
                        tag_val_length = 0;
1026
117k
                        state          = HTML_RFC2397_TYPE;
1027
117k
                        next_state     = HTML_TAG_ARG;
1028
58.9M
                    } else if (*ptr == '&') {
1029
1.00M
                        state      = HTML_CHAR_REF;
1030
1.00M
                        next_state = HTML_TAG_ARG_VAL;
1031
1.00M
                        ptr++;
1032
57.9M
                    } else if (*ptr == '\'') {
1033
766k
                        if (tag_val_length == 0) {
1034
287k
                            quoted = SINGLE_QUOTED;
1035
287k
                            html_output_c(file_buff_o2, '"');
1036
287k
                            if (tag_val_length < HTML_STR_LENGTH) {
1037
287k
                                tag_val[tag_val_length++] = '"';
1038
287k
                            }
1039
287k
                            ptr++;
1040
479k
                        } else {
1041
479k
                            if (!escape && (quoted == SINGLE_QUOTED)) {
1042
121k
                                html_output_c(file_buff_o2, '"');
1043
121k
                                if (tag_val_length < HTML_STR_LENGTH) {
1044
120k
                                    tag_val[tag_val_length++] = '"';
1045
120k
                                }
1046
121k
                                tag_val[tag_val_length] = '\0';
1047
121k
                                html_tag_arg_add(&tag_args, tag_arg, tag_val);
1048
121k
                                ptr++;
1049
121k
                                state          = HTML_SKIP_WS;
1050
121k
                                tag_arg_length = 0;
1051
121k
                                next_state     = HTML_TAG_ARG;
1052
357k
                            } else {
1053
357k
                                html_output_c(file_buff_o2, '"');
1054
357k
                                if (tag_val_length < HTML_STR_LENGTH) {
1055
320k
                                    tag_val[tag_val_length++] = '"';
1056
320k
                                }
1057
357k
                                ptr++;
1058
357k
                            }
1059
479k
                        }
1060
57.1M
                    } else if (*ptr == '"') {
1061
1.03M
                        if (tag_val_length == 0) {
1062
508k
                            quoted = DOUBLE_QUOTED;
1063
508k
                            html_output_c(file_buff_o2, '"');
1064
508k
                            if (tag_val_length < HTML_STR_LENGTH) {
1065
508k
                                tag_val[tag_val_length++] = '"';
1066
508k
                            }
1067
508k
                            ptr++;
1068
527k
                        } else {
1069
527k
                            if (!escape && (quoted == DOUBLE_QUOTED)) {
1070
160k
                                html_output_c(file_buff_o2, '"');
1071
160k
                                if (tag_val_length < HTML_STR_LENGTH) {
1072
159k
                                    tag_val[tag_val_length++] = '"';
1073
159k
                                }
1074
160k
                                tag_val[tag_val_length] = '\0';
1075
160k
                                html_tag_arg_add(&tag_args, tag_arg, tag_val);
1076
160k
                                ptr++;
1077
160k
                                state          = HTML_SKIP_WS;
1078
160k
                                tag_arg_length = 0;
1079
160k
                                next_state     = HTML_TAG_ARG;
1080
366k
                            } else {
1081
366k
                                html_output_c(file_buff_o2, '"');
1082
366k
                                if (tag_val_length < HTML_STR_LENGTH) {
1083
363k
                                    tag_val[tag_val_length++] = '"';
1084
363k
                                }
1085
366k
                                ptr++;
1086
366k
                            }
1087
527k
                        }
1088
56.1M
                    } else if (isspace(*ptr) || (*ptr == '>')) {
1089
2.50M
                        if (quoted == NOT_QUOTED) {
1090
2.11M
                            tag_val[tag_val_length] = '\0';
1091
2.11M
                            html_tag_arg_add(&tag_args, tag_arg, tag_val);
1092
2.11M
                            state          = HTML_SKIP_WS;
1093
2.11M
                            tag_arg_length = 0;
1094
2.11M
                            next_state     = HTML_TAG_ARG;
1095
2.11M
                        } else {
1096
395k
                            html_output_c(file_buff_o2, *ptr);
1097
395k
                            if (tag_val_length < HTML_STR_LENGTH) {
1098
393k
                                if (isspace(*ptr)) {
1099
107k
                                    tag_val[tag_val_length++] = ' ';
1100
285k
                                } else {
1101
285k
                                    tag_val[tag_val_length++] = '>';
1102
285k
                                }
1103
393k
                            }
1104
395k
                            state      = HTML_SKIP_WS;
1105
395k
                            escape     = false;
1106
395k
                            quoted     = NOT_QUOTED;
1107
395k
                            next_state = HTML_TAG_ARG_VAL;
1108
395k
                            ptr++;
1109
395k
                        }
1110
53.6M
                    } else {
1111
53.6M
                        if (mbchar2 && (*ptr < 0x80 || mbchar2 >= 0x10000)) {
1112
5.67M
                            if (mbchar2 == 0xE38082 || mbchar2 == 0xEFBC8E || mbchar2 == 0xEFB992 ||
1113
5.67M
                                (mbchar2 == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
1114
174k
                                html_output_c(file_buff_o2, '.');
1115
174k
                                if (tag_val_length < HTML_STR_LENGTH)
1116
172k
                                    tag_val[tag_val_length++] = '.';
1117
174k
                                if (mbchar2 == 0xA1) {
1118
149k
                                    ptr++;
1119
149k
                                    mbchar2 = 0;
1120
149k
                                    continue;
1121
149k
                                }
1122
5.50M
                            } else {
1123
5.50M
                                uint8_t c0 = mbchar2 >> 16;
1124
5.50M
                                uint8_t c1 = (mbchar2 >> 8) & 0xff;
1125
5.50M
                                uint8_t c2 = (mbchar2 & 0xff);
1126
5.50M
                                if (c0)
1127
3.12M
                                    html_output_c(file_buff_o2, c0);
1128
5.50M
                                if (c0 || c1)
1129
3.69M
                                    html_output_c(file_buff_o2, c1);
1130
5.50M
                                html_output_c(file_buff_o2, c2);
1131
5.50M
                                if (c0 && tag_val_length < HTML_STR_LENGTH)
1132
2.72M
                                    tag_val[tag_val_length++] = c0;
1133
5.50M
                                if ((c0 || c1) && tag_val_length < HTML_STR_LENGTH)
1134
3.28M
                                    tag_val[tag_val_length++] = c1;
1135
5.50M
                                if (tag_val_length < HTML_STR_LENGTH)
1136
5.06M
                                    tag_val[tag_val_length++] = c2;
1137
5.50M
                            }
1138
5.52M
                            mbchar2 = 0;
1139
5.52M
                        }
1140
53.4M
                        if (*ptr >= 0x80)
1141
12.5M
                            mbchar2 = (mbchar2 << 8) | *ptr;
1142
40.8M
                        else {
1143
40.8M
                            html_output_c(file_buff_o2, tolower(*ptr));
1144
40.8M
                            if (tag_val_length < HTML_STR_LENGTH) {
1145
37.9M
                                tag_val[tag_val_length++] = *ptr;
1146
37.9M
                            }
1147
40.8M
                        }
1148
53.4M
                        ptr++;
1149
53.4M
                    }
1150
1151
58.9M
                    if (*ptr == '\\') {
1152
68.1k
                        escape = true;
1153
58.8M
                    } else {
1154
58.8M
                        escape = false;
1155
58.8M
                    }
1156
58.9M
                    break;
1157
70.5M
                case HTML_COMMENT:
1158
70.5M
                    if (in_tag != TAG_DONT_EXTRACT && !isspace(*ptr)) {
1159
60.3M
                        unsigned char c = tolower(*ptr);
1160
                        /* dump script to nocomment.html, since we no longer have
1161
                         * comment.html/script.html */
1162
60.3M
                        if (c == '\'') c = '"';
1163
60.3M
                        html_output_c(file_buff_o2, c);
1164
60.3M
                    }
1165
70.5M
                    if (*ptr == '>') {
1166
520k
                        state      = HTML_SKIP_WS;
1167
520k
                        next_state = HTML_NORM;
1168
520k
                    }
1169
70.5M
                    ptr++;
1170
70.5M
                    break;
1171
9.60M
                case HTML_PROCESS_TAG:
1172
1173
                    /* Default to no action for this tag */
1174
9.60M
                    state      = HTML_SKIP_WS;
1175
9.60M
                    next_state = HTML_NORM;
1176
9.60M
                    if (tag[0] == '/') {
1177
                        /* End tag */
1178
2.22M
                        state      = HTML_SKIP_WS;
1179
2.22M
                        next_state = HTML_NORM;
1180
1181
2.22M
                        if (strcmp(tag, "/script") == 0) {
1182
667k
                            in_tag = TAG_DONT_EXTRACT;
1183
667k
                            if (js_state) {
1184
184k
                                js_end = ptr;
1185
184k
                                js_process(js_state, js_begin, js_end, line, ptr, in_tag, dirname);
1186
184k
                                js_state = NULL;
1187
184k
                                js_begin = js_end = NULL;
1188
184k
                            }
1189
                            /*don't output newlines in nocomment.html
1190
                             * html_output_c(file_buff_o2, '\n');*/
1191
1.55M
                        } else if ((strcmp(tag, "/style") == 0) && (in_tag == TAG_STYLE)) {
1192
19.8k
                            size_t chunk_size;
1193
1194
19.8k
                            style_end = ptr - strlen("</style>");
1195
1196
19.8k
                            if (style_end < style_begin) {
1197
416
                                cli_dbgmsg("cli_html_normalise: style chunk size underflow\n");
1198
416
                                goto done;
1199
416
                            }
1200
1201
19.4k
                            chunk_size = style_end - style_begin;
1202
1203
19.4k
                            if (style_buff == NULL) {
1204
14.4k
                                CLI_MAX_MALLOC_OR_GOTO_DONE(style_buff, chunk_size + 1);
1205
14.4k
                            } else {
1206
5.03k
                                CLI_MAX_REALLOC_OR_GOTO_DONE(style_buff, style_buff_size + chunk_size + 1);
1207
5.03k
                            }
1208
1209
19.4k
                            memcpy(style_buff + style_buff_size, style_begin, chunk_size);
1210
1211
19.4k
                            style_buff_size += chunk_size;
1212
19.4k
                            style_buff[style_buff_size] = '\0';
1213
1214
19.4k
                            in_tag      = TAG_DONT_EXTRACT;
1215
19.4k
                            style_begin = style_end = NULL;
1216
19.4k
                        }
1217
1218
2.22M
                        if (hrefs && hrefs->scanContents && in_ahref) {
1219
0
                            if (strcmp(tag, "/a") == 0) {
1220
0
                                html_tag_contents_done(hrefs, in_ahref, &contents);
1221
0
                                in_ahref = 0; /* we are no longer inside an <a href>
1222
                                                        nesting <a> tags not supported, and shouldn't be supported*/
1223
0
                            }
1224
0
                            href_contents_begin = ptr;
1225
0
                        }
1226
2.22M
                        if (strcmp(tag, "/form") == 0) {
1227
790
                            if (in_form_action)
1228
0
                                free(in_form_action);
1229
790
                            in_form_action = NULL;
1230
790
                        }
1231
7.38M
                    } else if (strcmp(tag, "script") == 0) {
1232
364k
                        arg_value = html_tag_arg_value(&tag_args, "language");
1233
                        /* TODO: maybe we can output all tags only via html_output_tag */
1234
364k
                        if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0)) {
1235
7.10k
                            html_tag_arg_set(&tag_args, "language", "javascript");
1236
7.10k
                            state      = HTML_SKIP_WS;
1237
7.10k
                            next_state = HTML_JSDECODE;
1238
                            /* we already output the old tag, output the new tag now */
1239
7.10k
                            html_output_tag(file_buff_o2, tag, &tag_args);
1240
357k
                        } else if (arg_value && (strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) {
1241
7.65k
                            html_tag_arg_set(&tag_args, "language", "vbscript");
1242
7.65k
                            state      = HTML_SKIP_WS;
1243
7.65k
                            next_state = HTML_JSDECODE;
1244
                            /* we already output the old tag, output the new tag now */
1245
7.65k
                            html_output_tag(file_buff_o2, tag, &tag_args);
1246
7.65k
                        }
1247
364k
                        in_tag = TAG_SCRIPT;
1248
364k
                        if (dconf_js && !js_state) {
1249
364k
                            js_state = cli_js_init();
1250
364k
                            if (!js_state) {
1251
0
                                cli_dbgmsg("htmlnorm: Failed to initialize js parser\n");
1252
0
                            }
1253
364k
                            js_begin = ptr;
1254
364k
                            js_end   = NULL;
1255
364k
                        }
1256
7.01M
                    } else if (strcmp(tag, "style") == 0) {
1257
35.7k
                        in_tag      = TAG_STYLE;
1258
35.7k
                        style_begin = ptr;
1259
35.7k
                        style_end   = NULL;
1260
6.98M
                    } else if (strcmp(tag, "%@") == 0) {
1261
675k
                        arg_value = html_tag_arg_value(&tag_args, "language");
1262
675k
                        if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0 ||
1263
486k
                                          strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) {
1264
1265
202k
                            saved_next_state = next_state;
1266
202k
                            next_state       = state;
1267
202k
                            look_for_screnc  = false;
1268
202k
                            state            = HTML_LOOKFOR_SCRENC;
1269
202k
                        }
1270
6.30M
                    } else if (hrefs) {
1271
0
                        if (in_ahref && !href_contents_begin)
1272
0
                            href_contents_begin = ptr;
1273
0
                        if (strcmp(tag, "a") == 0) {
1274
0
                            arg_value = html_tag_arg_value(&tag_args, "href");
1275
0
                            if (arg_value && strlen((const char *)arg_value) > 0) {
1276
0
                                if (hrefs->scanContents) {
1277
0
                                    char *arg_value_title = html_tag_arg_value(&tag_args, "title");
1278
                                    /*beginning of an <a> tag*/
1279
0
                                    if (in_ahref)
1280
                                        /*we encountered nested <a> tags, pretend previous closed*/
1281
0
                                        if (href_contents_begin) {
1282
0
                                            html_tag_contents_append(&contents, href_contents_begin, ptrend);
1283
                                            /*add pending contents between tags*/
1284
0
                                            html_tag_contents_done(hrefs, in_ahref, &contents);
1285
0
                                            in_ahref = 0;
1286
0
                                        }
1287
0
                                    if (arg_value_title) {
1288
                                        /* title is a 'displayed link'*/
1289
0
                                        html_tag_arg_add(hrefs, "href_title", arg_value_title);
1290
0
                                        html_tag_contents_append(&contents, (const unsigned char *)arg_value,
1291
0
                                                                 (const unsigned char *)arg_value + strlen(arg_value));
1292
0
                                        html_tag_contents_done(hrefs, hrefs->count, &contents);
1293
0
                                    }
1294
0
                                    if (in_form_action) {
1295
                                        /* form action is the real URL, and href is the 'displayed' */
1296
0
                                        html_tag_arg_add(hrefs, "form", arg_value);
1297
0
                                        contents.pos = 0;
1298
0
                                        html_tag_contents_append(&contents, in_form_action,
1299
0
                                                                 in_form_action + strlen((const char *)in_form_action));
1300
0
                                        html_tag_contents_done(hrefs, hrefs->count, &contents);
1301
0
                                    }
1302
0
                                }
1303
0
                                html_tag_arg_add(hrefs, "href", arg_value);
1304
0
                                if (hrefs->scanContents) {
1305
0
                                    in_ahref            = hrefs->count; /* index of this tag (counted from 1) */
1306
0
                                    href_contents_begin = ptr;          /* contents begin after <a ..> ends */
1307
0
                                    contents.pos        = 0;
1308
0
                                }
1309
0
                            }
1310
0
                        } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
1311
0
                            const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
1312
0
                            if (arg_action_value) {
1313
0
                                if (in_form_action)
1314
0
                                    free(in_form_action);
1315
0
                                in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
1316
0
                            }
1317
0
                        } else if (strcmp(tag, "img") == 0) {
1318
0
                            arg_value = html_tag_arg_value(&tag_args, "src");
1319
0
                            if (arg_value && strlen(arg_value) > 0) {
1320
0
                                html_tag_arg_add(hrefs, "src", arg_value);
1321
0
                                if (hrefs->scanContents && in_ahref)
1322
                                    /* "contents" of an img tag, is the URL of its parent <a> tag */
1323
0
                                    hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);
1324
0
                                if (in_form_action) {
1325
                                    /* form action is the real URL, and href is the 'displayed' */
1326
0
                                    html_tag_arg_add(hrefs, "form", arg_value);
1327
0
                                    contents.pos = 0;
1328
0
                                    html_tag_contents_append(&contents, in_form_action,
1329
0
                                                             in_form_action + strlen((const char *)in_form_action));
1330
0
                                    html_tag_contents_done(hrefs, hrefs->count, &contents);
1331
0
                                }
1332
0
                            }
1333
0
                            arg_value = html_tag_arg_value(&tag_args, "dynsrc");
1334
0
                            if (arg_value && strlen(arg_value) > 0) {
1335
0
                                html_tag_arg_add(hrefs, "dynsrc", arg_value);
1336
0
                                if (hrefs->scanContents && in_ahref)
1337
                                    /* see above */
1338
0
                                    hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);
1339
0
                                if (in_form_action) {
1340
                                    /* form action is the real URL, and href is the 'displayed' */
1341
0
                                    html_tag_arg_add(hrefs, "form", arg_value);
1342
0
                                    contents.pos = 0;
1343
0
                                    html_tag_contents_append(&contents, in_form_action,
1344
0
                                                             in_form_action + strlen((const char *)in_form_action));
1345
0
                                    html_tag_contents_done(hrefs, hrefs->count, &contents);
1346
0
                                }
1347
0
                            }
1348
0
                        } else if (strcmp(tag, "iframe") == 0) {
1349
0
                            arg_value = html_tag_arg_value(&tag_args, "src");
1350
0
                            if (arg_value && strlen(arg_value) > 0) {
1351
0
                                html_tag_arg_add(hrefs, "iframe", arg_value);
1352
0
                                if (hrefs->scanContents && in_ahref)
1353
                                    /* see above */
1354
0
                                    hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);
1355
0
                                if (in_form_action) {
1356
                                    /* form action is the real URL, and href is the 'displayed' */
1357
0
                                    html_tag_arg_add(hrefs, "form", arg_value);
1358
0
                                    contents.pos = 0;
1359
0
                                    html_tag_contents_append(&contents, in_form_action,
1360
0
                                                             in_form_action + strlen((const char *)in_form_action));
1361
0
                                    html_tag_contents_done(hrefs, hrefs->count, &contents);
1362
0
                                }
1363
0
                            }
1364
0
                        } else if (strcmp(tag, "area") == 0) {
1365
0
                            arg_value = html_tag_arg_value(&tag_args, "href");
1366
0
                            if (arg_value && strlen(arg_value) > 0) {
1367
0
                                html_tag_arg_add(hrefs, "area", arg_value);
1368
0
                                if (hrefs->scanContents && in_ahref)
1369
                                    /* see above */
1370
0
                                    hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_safer_strdup((const char *)hrefs->value[in_ahref - 1]);
1371
0
                                if (in_form_action) {
1372
                                    /* form action is the real URL, and href is the 'displayed' */
1373
0
                                    html_tag_arg_add(hrefs, "form", arg_value);
1374
0
                                    contents.pos = 0;
1375
0
                                    html_tag_contents_append(&contents, in_form_action,
1376
0
                                                             in_form_action + strlen((const char *)in_form_action));
1377
0
                                    html_tag_contents_done(hrefs, hrefs->count, &contents);
1378
0
                                }
1379
0
                            }
1380
0
                        }
1381
                        /* TODO:imagemaps can have urls too */
1382
6.30M
                    } else if (strcmp(tag, "a") == 0) {
1383
                        /* a/img tags for buff_text can be processed only if we're not processing hrefs */
1384
744k
                        arg_value = html_tag_arg_value(&tag_args, "href");
1385
744k
                        if (arg_value && arg_value[0]) {
1386
539k
                            html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));
1387
539k
                            html_output_c(file_buff_text, ' ');
1388
539k
                            text_space_written = true;
1389
539k
                        }
1390
5.56M
                    } else if (strcmp(tag, "img") == 0) {
1391
94.2k
                        arg_value = html_tag_arg_value(&tag_args, "src");
1392
94.2k
                        if (arg_value && arg_value[0]) {
1393
15.9k
                            html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));
1394
15.9k
                            html_output_c(file_buff_text, ' ');
1395
15.9k
                            text_space_written = true;
1396
15.9k
                        }
1397
94.2k
                    }
1398
9.60M
                    html_tag_arg_free(&tag_args);
1399
9.60M
                    break;
1400
10.8M
                case HTML_CHAR_REF:
1401
10.8M
                    if (*ptr == '#') {
1402
2.96M
                        value = 0;
1403
2.96M
                        hex   = false;
1404
2.96M
                        state = HTML_CHAR_REF_DECODE;
1405
2.96M
                        ptr++;
1406
7.89M
                    } else {
1407
7.89M
                        if (dconf_entconv)
1408
7.89M
                            state = HTML_ENTITY_REF_DECODE;
1409
0
                        else {
1410
0
                            if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1411
0
                                tag_val[tag_val_length++] = '&';
1412
0
                            }
1413
0
                            html_output_c(file_buff_o2, '&');
1414
1415
0
                            state      = next_state;
1416
0
                            next_state = HTML_BAD_STATE;
1417
0
                        }
1418
7.89M
                    }
1419
10.8M
                    break;
1420
19.7M
                case HTML_ENTITY_REF_DECODE:
1421
19.7M
                    if (*ptr == ';') {
1422
1.14M
                        size_t i;
1423
1.14M
                        const char *normalized;
1424
1.14M
                        entity_val[entity_val_length] = '\0';
1425
1.14M
                        normalized                    = entity_norm(&conv, entity_val);
1426
1.14M
                        if (normalized) {
1427
527k
                            for (i = 0; i < strlen(normalized); i++) {
1428
425k
                                const unsigned char c = normalized[i] & 0xff;
1429
425k
                                html_output_c(file_buff_o2, c);
1430
425k
                                if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1431
59.3k
                                    tag_val[tag_val_length++] = c;
1432
59.3k
                                }
1433
425k
                            }
1434
1.04M
                        } else {
1435
1.04M
                            html_output_c(file_buff_o2, '&');
1436
1.04M
                            if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1437
270k
                                tag_val[tag_val_length++] = '&';
1438
270k
                            }
1439
2.57M
                            for (i = 0; i < entity_val_length; i++) {
1440
1.52M
                                const char c = tolower(entity_val[i]);
1441
1.52M
                                html_output_c(file_buff_o2, c);
1442
1.52M
                                if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1443
290k
                                    tag_val[tag_val_length++] = c;
1444
290k
                                }
1445
1.52M
                            }
1446
1.04M
                            if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1447
270k
                                tag_val[tag_val_length++] = ';';
1448
270k
                            }
1449
1.04M
                            html_output_c(file_buff_o2, ';');
1450
1.04M
                        }
1451
1.14M
                        entity_val_length = 0;
1452
1.14M
                        state             = next_state;
1453
1.14M
                        next_state        = HTML_BAD_STATE;
1454
1.14M
                        ptr++;
1455
18.6M
                    } else if ((isalnum(*ptr) || *ptr == '_' || *ptr == ':' || (*ptr == '-')) && entity_val_length < HTML_STR_LENGTH) {
1456
11.8M
                        entity_val[entity_val_length++] = *ptr++;
1457
11.8M
                    } else {
1458
                        /* entity too long, or not valid, dump it */
1459
6.74M
                        size_t i;
1460
6.74M
                        if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1461
468k
                            tag_val[tag_val_length++] = '&';
1462
468k
                        }
1463
6.74M
                        html_output_c(file_buff_o2, '&');
1464
16.5M
                        for (i = 0; i < entity_val_length; i++) {
1465
9.75M
                            const char c = tolower(entity_val[i]);
1466
9.75M
                            html_output_c(file_buff_o2, c);
1467
9.75M
                            if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1468
836k
                                tag_val[tag_val_length++] = c;
1469
836k
                            }
1470
9.75M
                        }
1471
1472
6.74M
                        state             = next_state;
1473
6.74M
                        next_state        = HTML_BAD_STATE;
1474
6.74M
                        entity_val_length = 0;
1475
6.74M
                    }
1476
19.7M
                    break;
1477
9.67M
                case HTML_CHAR_REF_DECODE:
1478
9.67M
                    if ((value == 0) && ((*ptr == 'x') || (*ptr == 'X'))) {
1479
217k
                        hex = true;
1480
217k
                        ptr++;
1481
9.46M
                    } else if (*ptr == ';') {
1482
2.02M
                        if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1483
164k
                            tag_val[tag_val_length++] = value; /* store encoded values too */
1484
164k
                        }
1485
2.02M
                        if (dconf_entconv) {
1486
1487
2.02M
                            if (value < 0x80)
1488
1.82M
                                html_output_c(file_buff_o2, tolower(value));
1489
197k
                            else {
1490
197k
                                unsigned char buff[10];
1491
197k
                                unsigned char *out = u16_normalize_tobuffer(value, buff, 10);
1492
197k
                                if (out && out > buff) {
1493
197k
                                    html_output_str(file_buff_o2, buff, out - buff - 1);
1494
197k
                                }
1495
197k
                            }
1496
2.02M
                        } else
1497
0
                            html_output_c(file_buff_o2, tolower(value & 0xff));
1498
2.02M
                        state      = next_state;
1499
2.02M
                        next_state = HTML_BAD_STATE;
1500
2.02M
                        ptr++;
1501
7.44M
                    } else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) {
1502
6.52M
                        int64_t increment = 0;
1503
1504
6.52M
                        if (hex && value < INT64_MAX / 16) {
1505
269k
                            value *= 16;
1506
6.25M
                        } else if (value < INT64_MAX / 10) {
1507
6.23M
                            value *= 10;
1508
6.23M
                        } else {
1509
27.6k
                            html_output_c(file_buff_o2, value);
1510
27.6k
                            state      = next_state;
1511
27.6k
                            next_state = HTML_BAD_STATE;
1512
27.6k
                            ptr++;
1513
27.6k
                            break;
1514
27.6k
                        }
1515
6.50M
                        if (isdigit(*ptr)) {
1516
6.39M
                            increment = *ptr - '0';
1517
6.39M
                        } else {
1518
104k
                            increment = tolower(*ptr) - 'a' + 10;
1519
104k
                        }
1520
6.50M
                        if (value > INT64_MAX - increment) {
1521
                            /* Addition would result in integer overflow. */
1522
0
                            html_output_c(file_buff_o2, value);
1523
0
                            state      = next_state;
1524
0
                            next_state = HTML_BAD_STATE;
1525
0
                            ptr++;
1526
0
                            break;
1527
0
                        }
1528
6.50M
                        value += increment;
1529
6.50M
                        ptr++;
1530
6.50M
                    } else {
1531
912k
                        html_output_c(file_buff_o2, value);
1532
912k
                        state      = next_state;
1533
912k
                        next_state = HTML_BAD_STATE;
1534
912k
                    }
1535
9.65M
                    break;
1536
9.65M
                case HTML_LOOKFOR_SCRENC:
1537
278k
                    look_for_screnc = true;
1538
278k
                    ptr_screnc      = (unsigned char *)strstr((char *)ptr, "#@~^");
1539
278k
                    if (ptr_screnc) {
1540
263k
                        ptr_screnc[0] = '/';
1541
263k
                        ptr_screnc[1] = '/';
1542
263k
                        ptr_screnc += 4;
1543
263k
                    }
1544
278k
                    state      = next_state;
1545
278k
                    next_state = saved_next_state;
1546
278k
                    break;
1547
7.07M
                case HTML_JSDECODE:
1548
                    /* Check for start marker */
1549
7.07M
                    if (strncmp((const char *)ptr, "#@~^", 4) == 0) {
1550
4.94k
                        ptr[0] = '/';
1551
4.94k
                        ptr[1] = '/';
1552
4.94k
                        ptr += 4;
1553
4.94k
                        state      = HTML_JSDECODE_LENGTH;
1554
4.94k
                        next_state = HTML_BAD_STATE;
1555
7.06M
                    } else {
1556
7.06M
                        html_output_c(file_buff_o2, tolower(*ptr));
1557
7.06M
                        ptr++;
1558
7.06M
                    }
1559
7.07M
                    break;
1560
248k
                case HTML_JSDECODE_LENGTH:
1561
248k
                    if (strlen((const char *)ptr) < 8) {
1562
837
                        state      = HTML_NORM;
1563
837
                        next_state = HTML_BAD_STATE;
1564
837
                        break;
1565
837
                    }
1566
247k
                    memset(&screnc_state, 0, sizeof(screnc_state));
1567
247k
                    screnc_state.length = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;
1568
247k
                    screnc_state.length += base64_chars[ptr[1]] >> 4;
1569
247k
                    screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12;
1570
247k
                    screnc_state.length += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;
1571
247k
                    screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22;
1572
247k
                    screnc_state.length += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;
1573
247k
                    screnc_state.length += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;
1574
247k
                    screnc_state.length += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;
1575
247k
                    state      = HTML_JSDECODE_DECRYPT;
1576
247k
                    in_screnc  = true;
1577
247k
                    next_state = HTML_BAD_STATE;
1578
                    /* for JS normalizer */
1579
247k
                    ptr[7] = '\n';
1580
247k
                    ptr += 8;
1581
247k
                    break;
1582
268k
                case HTML_JSDECODE_DECRYPT:
1583
268k
                    screnc_decode(ptr, &screnc_state);
1584
268k
                    if (!screnc_state.length) {
1585
210k
                        state      = HTML_NORM;
1586
210k
                        next_state = HTML_BAD_STATE;
1587
210k
                        in_screnc  = false;
1588
210k
                        break;
1589
210k
                    } else {
1590
57.9k
                        state      = HTML_NORM;
1591
57.9k
                        next_state = HTML_BAD_STATE;
1592
57.9k
                    }
1593
57.9k
                    break;
1594
5.79M
                case HTML_RFC2397_TYPE:
1595
5.79M
                    if (*ptr == '\'') {
1596
121k
                        if (!escape && (quoted == SINGLE_QUOTED)) {
1597
                            /* Early end of data detected. Error */
1598
3.91k
                            ptr++;
1599
3.91k
                            state          = HTML_SKIP_WS;
1600
3.91k
                            tag_arg_length = 0;
1601
3.91k
                            next_state     = HTML_TAG_ARG;
1602
117k
                        } else {
1603
117k
                            if (tag_val_length < HTML_STR_LENGTH) {
1604
106k
                                tag_val[tag_val_length++] = '"';
1605
106k
                            }
1606
117k
                            ptr++;
1607
117k
                        }
1608
5.67M
                    } else if (*ptr == '"') {
1609
1.22M
                        if (!escape && (quoted == DOUBLE_QUOTED)) {
1610
                            /* Early end of data detected. Error */
1611
4.84k
                            ptr++;
1612
4.84k
                            state          = HTML_SKIP_WS;
1613
4.84k
                            tag_arg_length = 0;
1614
4.84k
                            next_state     = HTML_TAG_ARG;
1615
1.22M
                        } else {
1616
1.22M
                            if (tag_val_length < HTML_STR_LENGTH) {
1617
447k
                                tag_val[tag_val_length++] = '"';
1618
447k
                            }
1619
1.22M
                            ptr++;
1620
1.22M
                        }
1621
4.44M
                    } else if (isspace(*ptr) || (*ptr == '>')) {
1622
45.4k
                        if (quoted == NOT_QUOTED) {
1623
                            /* Early end of data detected. Error */
1624
22.0k
                            state          = HTML_SKIP_WS;
1625
22.0k
                            tag_arg_length = 0;
1626
22.0k
                            next_state     = HTML_TAG_ARG;
1627
23.3k
                        } else {
1628
23.3k
                            if (tag_val_length < HTML_STR_LENGTH) {
1629
22.4k
                                if (isspace(*ptr)) {
1630
5.96k
                                    tag_val[tag_val_length++] = ' ';
1631
16.5k
                                } else {
1632
16.5k
                                    tag_val[tag_val_length++] = '>';
1633
16.5k
                                }
1634
22.4k
                            }
1635
23.3k
                            state      = HTML_SKIP_WS;
1636
23.3k
                            escape     = false;
1637
23.3k
                            quoted     = NOT_QUOTED;
1638
23.3k
                            next_state = HTML_RFC2397_TYPE;
1639
23.3k
                            ptr++;
1640
23.3k
                        }
1641
4.39M
                    } else if (*ptr == ',') {
1642
                        /* Beginning of data */
1643
107k
                        tag_val[tag_val_length] = '\0';
1644
107k
                        state                   = HTML_RFC2397_INIT;
1645
107k
                        escape                  = false;
1646
107k
                        next_state              = HTML_BAD_STATE;
1647
107k
                        ptr++;
1648
1649
4.28M
                    } else {
1650
4.28M
                        if (tag_val_length < HTML_STR_LENGTH) {
1651
3.59M
                            tag_val[tag_val_length++] = tolower(*ptr);
1652
3.59M
                        }
1653
4.28M
                        ptr++;
1654
4.28M
                    }
1655
5.79M
                    if (*ptr == '\\') {
1656
11.1k
                        escape = true;
1657
5.78M
                    } else {
1658
5.78M
                        escape = false;
1659
5.78M
                    }
1660
5.79M
                    break;
1661
107k
                case HTML_RFC2397_INIT:
1662
107k
                    if (dirname) {
1663
107k
                        STATBUF statbuf;
1664
1665
107k
                        if (NULL != file_tmp_o1) {
1666
5.33k
                            if (file_tmp_o1->fd != -1) {
1667
5.33k
                                html_output_flush(file_tmp_o1);
1668
5.33k
                                close(file_tmp_o1->fd);
1669
5.33k
                                file_tmp_o1->fd = -1;
1670
5.33k
                            }
1671
5.33k
                            free(file_tmp_o1);
1672
5.33k
                        }
1673
1674
107k
                        file_tmp_o1 = (file_buff_t *)malloc(sizeof(file_buff_t));
1675
107k
                        if (!file_tmp_o1) {
1676
0
                            cli_errmsg("cli_html_normalise: Unable to allocate memory for file_tmp_o1\n");
1677
0
                            goto done;
1678
0
                        }
1679
107k
                        file_tmp_o1->fd = -1;
1680
1681
                        /* Create rfc2397 directory if it doesn't already exist */
1682
107k
                        snprintf(filename, 1024, "%s" PATHSEP "rfc2397", dirname);
1683
107k
                        if (LSTAT(filename, &statbuf) == -1) {
1684
17.0k
                            if (mkdir(filename, 0700) && errno != EEXIST) {
1685
0
                                cli_errmsg("Failed to create directory: %s\n", dirname);
1686
0
                                goto done;
1687
0
                            }
1688
17.0k
                        }
1689
1690
107k
                        tmp_file = cli_gentemp(filename);
1691
107k
                        if (!tmp_file) {
1692
0
                            goto done;
1693
0
                        }
1694
107k
                        cli_dbgmsg("RFC2397 data file: %s\n", tmp_file);
1695
107k
                        file_tmp_o1->fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);
1696
107k
                        free(tmp_file);
1697
107k
                        if (file_tmp_o1->fd < 0) {
1698
0
                            cli_dbgmsg("open failed: %s\n", filename);
1699
0
                            goto done;
1700
0
                        }
1701
107k
                        file_tmp_o1->length = 0;
1702
1703
107k
                        html_output_str(file_tmp_o1, (const unsigned char *)"From html-normalise\n", 20);
1704
107k
                        html_output_str(file_tmp_o1, (const unsigned char *)"Content-type: ", 14);
1705
107k
                        if ((tag_val_length == 0) && (*tag_val == ';')) {
1706
0
                            html_output_str(file_tmp_o1, (const unsigned char *)"text/plain\n", 11);
1707
0
                        }
1708
107k
                        html_output_str(file_tmp_o1, (const unsigned char *)tag_val, tag_val_length);
1709
107k
                        html_output_c(file_tmp_o1, '\n');
1710
107k
                        if (strstr(tag_val, ";base64") != NULL) {
1711
3.49k
                            html_output_str(file_tmp_o1, (const unsigned char *)"Content-transfer-encoding: base64\n", 34);
1712
3.49k
                        }
1713
107k
                        html_output_c(file_tmp_o1, '\n');
1714
107k
                    } else {
1715
0
                        file_tmp_o1 = NULL;
1716
0
                    }
1717
107k
                    state  = HTML_RFC2397_DATA;
1718
107k
                    binary = true;
1719
107k
                    break;
1720
57.3M
                case HTML_RFC2397_DATA:
1721
57.3M
                    if (*ptr == '&') {
1722
71.8k
                        state      = HTML_CHAR_REF;
1723
71.8k
                        next_state = HTML_RFC2397_DATA;
1724
71.8k
                        ptr++;
1725
57.3M
                    } else if (*ptr == '%') {
1726
414k
                        length     = 0;
1727
414k
                        value      = 0;
1728
414k
                        state      = HTML_ESCAPE_CHAR;
1729
414k
                        next_state = HTML_RFC2397_ESC;
1730
414k
                        ptr++;
1731
56.8M
                    } else if (*ptr == '\'') {
1732
127k
                        if (!escape && (quoted == SINGLE_QUOTED)) {
1733
68.6k
                            state = HTML_RFC2397_FINISH;
1734
68.6k
                            ptr++;
1735
68.6k
                        } else {
1736
59.0k
                            html_output_c(file_tmp_o1, *ptr);
1737
59.0k
                            ptr++;
1738
59.0k
                        }
1739
56.7M
                    } else if (*ptr == '\"') {
1740
7.17M
                        if (!escape && (quoted == DOUBLE_QUOTED)) {
1741
7.75k
                            state = HTML_RFC2397_FINISH;
1742
7.75k
                            ptr++;
1743
7.16M
                        } else {
1744
7.16M
                            html_output_c(file_tmp_o1, *ptr);
1745
7.16M
                            ptr++;
1746
7.16M
                        }
1747
49.5M
                    } else if (isspace(*ptr) || (*ptr == '>')) {
1748
6.62M
                        if (quoted == NOT_QUOTED) {
1749
20.3k
                            state = HTML_RFC2397_FINISH;
1750
20.3k
                            ptr++;
1751
6.60M
                        } else {
1752
6.60M
                            html_output_c(file_tmp_o1, *ptr);
1753
6.60M
                            ptr++;
1754
6.60M
                        }
1755
42.9M
                    } else {
1756
42.9M
                        html_output_c(file_tmp_o1, *ptr);
1757
42.9M
                        ptr++;
1758
42.9M
                    }
1759
57.3M
                    if (*ptr == '\\') {
1760
62.3k
                        escape = true;
1761
57.3M
                    } else {
1762
57.3M
                        escape = false;
1763
57.3M
                    }
1764
57.3M
                    break;
1765
96.8k
                case HTML_RFC2397_FINISH:
1766
96.8k
                    if (file_tmp_o1) {
1767
96.8k
                        if (file_tmp_o1->fd != -1) {
1768
96.8k
                            html_output_flush(file_tmp_o1);
1769
96.8k
                            close(file_tmp_o1->fd);
1770
96.8k
                            file_tmp_o1->fd = -1;
1771
96.8k
                        }
1772
96.8k
                        free(file_tmp_o1);
1773
96.8k
                        file_tmp_o1 = NULL;
1774
96.8k
                    }
1775
96.8k
                    state      = HTML_SKIP_WS;
1776
96.8k
                    escape     = false;
1777
96.8k
                    quoted     = NOT_QUOTED;
1778
96.8k
                    next_state = HTML_TAG_ARG;
1779
96.8k
                    binary     = false;
1780
96.8k
                    break;
1781
414k
                case HTML_RFC2397_ESC:
1782
414k
                    if (length == 2) {
1783
52.5k
                        html_output_c(file_tmp_o1, value);
1784
361k
                    } else if (length == 1) {
1785
361k
                        html_output_c(file_tmp_o1, '%');
1786
361k
                        html_output_c(file_tmp_o1, value + '0');
1787
361k
                    } else {
1788
0
                        html_output_c(file_tmp_o1, '%');
1789
0
                    }
1790
414k
                    state = HTML_RFC2397_DATA;
1791
414k
                    break;
1792
466k
                case HTML_ESCAPE_CHAR:
1793
466k
                    if (value < INT64_MAX / 16) {
1794
466k
                        value *= 16;
1795
466k
                    } else {
1796
0
                        state      = next_state;
1797
0
                        next_state = HTML_BAD_STATE;
1798
0
                        ptr++;
1799
0
                        break;
1800
0
                    }
1801
466k
                    length++;
1802
466k
                    if (isxdigit(*ptr)) {
1803
76.6k
                        if (isdigit(*ptr)) {
1804
63.2k
                            value += (*ptr - '0');
1805
63.2k
                        } else {
1806
13.4k
                            value += (tolower(*ptr) - 'a' + 10);
1807
13.4k
                        }
1808
389k
                    } else {
1809
389k
                        state = next_state;
1810
389k
                    }
1811
466k
                    if (length == 2) {
1812
52.5k
                        state = next_state;
1813
52.5k
                    }
1814
466k
                    ptr++;
1815
466k
                    break;
1816
3.18G
            }
1817
3.18G
        }
1818
699k
        if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
1819
            /* end of line, append contents now, resume on next line */
1820
0
            html_tag_contents_append(&contents, href_contents_begin, ptr);
1821
699k
        ptrend = NULL;
1822
1823
699k
        if (js_state) {
1824
455k
            js_process(js_state, js_begin, js_end, line, ptr, in_tag, dirname);
1825
455k
            js_begin = js_end = NULL;
1826
455k
            if (in_tag == TAG_DONT_EXTRACT) {
1827
0
                js_state = NULL;
1828
0
            }
1829
455k
        }
1830
1831
699k
        if (in_tag == TAG_STYLE) {
1832
17.2k
            if (ptr < style_begin) {
1833
210
                cli_dbgmsg("cli_html_normalise: style chunk size underflow\n");
1834
210
                goto done;
1835
210
            }
1836
1837
17.0k
            size_t chunk_size = ptr - style_begin;
1838
1839
17.0k
            if (style_buff == NULL) {
1840
2.14k
                CLI_MAX_MALLOC_OR_GOTO_DONE(style_buff, chunk_size + 1);
1841
14.9k
            } else {
1842
14.9k
                CLI_MAX_REALLOC_OR_GOTO_DONE(style_buff, style_buff_size + chunk_size + 1);
1843
14.9k
            }
1844
1845
17.0k
            memcpy(style_buff + style_buff_size, style_begin, chunk_size);
1846
1847
17.0k
            style_buff_size += chunk_size;
1848
17.0k
            style_buff[style_buff_size] = '\0';
1849
17.0k
        }
1850
1851
699k
        if (look_for_screnc && ptr_screnc) {
1852
            /* start found, and stuff before it already processed */
1853
243k
            ptr        = ptr_screnc;
1854
243k
            ptr_screnc = NULL;
1855
243k
            state      = HTML_JSDECODE_LENGTH;
1856
243k
            next_state = HTML_BAD_STATE;
1857
243k
            continue;
1858
243k
        }
1859
455k
        free(line);
1860
455k
        ptr = line = cli_readchunk(stream_in, m_area, 8192);
1861
1862
455k
        if (in_tag == TAG_STYLE) {
1863
            // reset style_begin to start of the next line
1864
14.8k
            style_begin = line;
1865
14.8k
        }
1866
1867
455k
        if (in_screnc) {
1868
56.1k
            state      = HTML_JSDECODE_DECRYPT;
1869
56.1k
            next_state = HTML_BAD_STATE;
1870
399k
        } else if (look_for_screnc && !ptr_screnc &&
1871
399k
                   state != HTML_LOOKFOR_SCRENC) {
1872
127k
            saved_next_state = next_state;
1873
127k
            next_state       = state;
1874
127k
            state            = HTML_LOOKFOR_SCRENC;
1875
127k
        }
1876
1877
455k
        if (next_state == state) {
1878
            /* safeguard against infloop */
1879
27.6k
            cli_dbgmsg("htmlnorm.c: next_state == state, changing next_state\n");
1880
27.6k
            next_state = HTML_BAD_STATE;
1881
27.6k
        }
1882
455k
    }
1883
1884
259k
    if (style_buff != NULL) {
1885
        // Found contents of <style> ... </style> block.
1886
        // Search it for images embedded in the CSS.
1887
15.9k
        cl_error_t ret = html_style_block_handler(ctx, (const char *)style_buff);
1888
15.9k
        if (CL_SUCCESS != ret) {
1889
0
            cli_dbgmsg("Scan of image extracted from html <style> block returned: %s\n", cl_strerror(ret));
1890
0
            goto done;
1891
0
        }
1892
1893
15.9k
        free(style_buff);
1894
15.9k
        style_buff = NULL;
1895
15.9k
    }
1896
1897
259k
    if (dconf_entconv) {
1898
        /* handle "unfinished" entities */
1899
259k
        size_t i;
1900
259k
        const char *normalized;
1901
259k
        entity_val[entity_val_length] = '\0';
1902
259k
        normalized                    = entity_norm(&conv, entity_val);
1903
259k
        if (normalized) {
1904
8.40k
            for (i = 0; i < strlen(normalized); i++)
1905
7.46k
                html_output_c(file_buff_o2, normalized[i] & 0xff);
1906
258k
        } else {
1907
258k
            if (entity_val_length) {
1908
1.62k
                html_output_c(file_buff_o2, '&');
1909
179k
                for (i = 0; i < entity_val_length; i++)
1910
177k
                    html_output_c(file_buff_o2, tolower(entity_val[i]));
1911
1.62k
            }
1912
258k
        }
1913
259k
    }
1914
1915
259k
    retval = true;
1916
1917
259k
done:
1918
259k
    if (line) /* only needed for done case */
1919
626
        free(line);
1920
259k
    if (in_form_action)
1921
0
        free(in_form_action);
1922
259k
    if (in_ahref) /* tag not closed, force closing */
1923
0
        html_tag_contents_done(hrefs, in_ahref, &contents);
1924
1925
259k
    if (js_state) {
1926
        /*  output script so far */
1927
180k
        cli_js_parse_done(js_state);
1928
180k
        cli_js_output(js_state, dirname);
1929
180k
        cli_js_destroy(js_state);
1930
180k
        js_state = NULL;
1931
180k
    }
1932
259k
    html_tag_arg_free(&tag_args);
1933
259k
    if (!m_area) {
1934
0
        fclose(stream_in);
1935
0
    }
1936
259k
    if (file_buff_o2) {
1937
259k
        html_output_flush(file_buff_o2);
1938
259k
        if (file_buff_o2->fd != -1)
1939
259k
            close(file_buff_o2->fd);
1940
259k
        free(file_buff_o2);
1941
259k
    }
1942
259k
    if (file_buff_text) {
1943
259k
        html_output_flush(file_buff_text);
1944
259k
        if (file_buff_text->fd != -1)
1945
259k
            close(file_buff_text->fd);
1946
259k
        free(file_buff_text);
1947
259k
        file_buff_text = NULL;
1948
259k
    }
1949
259k
    if (file_tmp_o1) {
1950
5.33k
        if (file_tmp_o1->fd != -1) {
1951
5.33k
            html_output_flush(file_tmp_o1);
1952
5.33k
            close(file_tmp_o1->fd);
1953
5.33k
        }
1954
5.33k
        free(file_tmp_o1);
1955
5.33k
    }
1956
259k
    if (style_buff != NULL) {
1957
626
        free(style_buff);
1958
626
    }
1959
259k
    return retval;
1960
259k
}
1961
1962
/**
 * Normalise an HTML document that lives in a caller-supplied memory buffer.
 *
 * The buffer is described by a local m_area_t with no fmap attached, and the
 * actual work is delegated to cli_html_normalise() with a file descriptor
 * of -1.
 *
 * @param ctx      Scan context, forwarded unchanged.
 * @param in_buff  First byte of the HTML data.
 * @param in_size  Number of bytes available at in_buff.
 * @param dirname  Forwarded unchanged to cli_html_normalise().
 * @param hrefs    Forwarded unchanged to cli_html_normalise().
 * @param dconf    Forwarded unchanged to cli_html_normalise().
 *
 * @return The value returned by cli_html_normalise().
 */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    m_area_t mem_area;

    /* Memory-backed area: no fmap, reading starts at offset 0. */
    mem_area.map    = NULL;
    mem_area.offset = 0;
    mem_area.length = in_size;
    mem_area.buffer = in_buff;

    bool result = cli_html_normalise(ctx, -1, &mem_area, dirname, hrefs, dconf);

    return result;
}
1973
1974
/**
 * Normalise an HTML document that is backed by an fmap.
 *
 * The map is wrapped in a local m_area_t and the actual work is delegated to
 * cli_html_normalise() with a file descriptor of -1.
 *
 * @param ctx      Scan context, forwarded unchanged.
 * @param map      File map holding the HTML data; must not be NULL
 *                 (map->len is read unconditionally).
 * @param dirname  Forwarded unchanged to cli_html_normalise().
 * @param hrefs    Forwarded unchanged to cli_html_normalise().
 * @param dconf    Forwarded unchanged to cli_html_normalise().
 *
 * @return The value returned by cli_html_normalise().
 */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    m_area_t m_area;

    /*
     * Zero the whole descriptor first so that members which are not set
     * below (such as the buffer pointer) are never left indeterminate.
     * html_screnc_decode() prepares its fmap-backed area the same way.
     */
    memset(&m_area, 0, sizeof(m_area));
    m_area.length = map->len;
    m_area.offset = 0;
    m_area.map    = map;

    return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
}
1985
1986
/**
 * Decode a Script Encoder ("#@~^") protected block found in an fmap and write
 * the decoded text, wrapped in <script>...</script>, to
 * "<dirname>/screnc.html".
 *
 * The input is read in chunks via cli_readchunk(). After the "#@~^" marker,
 * eight characters are consumed: the first six are the base64-encoded payload
 * length, the remaining two are skipped. The payload is then decoded chunk by
 * chunk with screnc_decode() until the announced length is exhausted or the
 * input ends.
 *
 * @param map      File map holding the encoded document; must not be NULL
 *                 (map->len is read unconditionally).
 * @param dirname  Directory in which "screnc.html" is created.
 *
 * @return true once the length header has been parsed and decoding has been
 *         attempted (even if the input ended before all announced bytes were
 *         seen); false if the output file cannot be opened, no "#@~^" marker
 *         is found, or the input ends inside the eight-character header.
 */
bool html_screnc_decode(fmap_t *map, const char *dirname)
{
    int count;
    bool retval         = false;
    unsigned char *line = NULL, tmpstr[6];
    unsigned char *ptr  = NULL, filename[1024];
    int ofd;
    struct screnc_state screnc_state;
    m_area_t m_area;

    memset(&m_area, 0, sizeof(m_area));
    m_area.length = map->len;
    m_area.offset = 0;
    m_area.map    = map;

    snprintf((char *)filename, sizeof(filename), "%s" PATHSEP "screnc.html", dirname);
    ofd = open((const char *)filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IWUSR | S_IRUSR);

    if (ofd < 0) {
        cli_dbgmsg("open failed: %s\n", filename);
        return false;
    }

    /* Scan forward chunk by chunk until the Script Encoder start marker shows up. */
    while ((line = cli_readchunk(NULL, &m_area, 8192)) != NULL) {
        ptr = (unsigned char *)strstr((char *)line, "#@~^");
        if (ptr) {
            break;
        }
        free(line);
        line = NULL;
    }
    if (!line) {
        goto done;
    }

    /* Calculate the length of the encoded string */
    ptr += 4;
    count = 0;
    do {
        if (!*ptr) {
            /* Header straddles a chunk boundary: continue in the next chunk. */
            free(line);
            ptr = line = cli_readchunk(NULL, &m_area, 8192);
            if (!line) {
                goto done;
            }
        }
        /* Only the first six header characters carry the length. */
        if (count < 6)
            tmpstr[count] = *ptr;
        count++;
        ptr++;
    } while (count < 8);

    memset(&screnc_state, 0, sizeof(screnc_state));
    screnc_state.length = base64_chars[tmpstr[0]] < 0 ? 0 : base64_chars[tmpstr[0]] << 2;
    screnc_state.length += base64_chars[tmpstr[1]] >> 4;
    screnc_state.length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
    screnc_state.length += ((base64_chars[tmpstr[2]] >> 2) < 0 ? 0 : (base64_chars[tmpstr[2]] >> 2)) << 8;
    screnc_state.length += (base64_chars[tmpstr[2]] & 0x03) << 22;
    screnc_state.length += base64_chars[tmpstr[3]] < 0 ? 0 : base64_chars[tmpstr[3]] << 16;
    /*
     * The inner value can be as large as 252; shifting that left by 24 does
     * not fit in a signed int, so perform the shift on an unsigned value to
     * keep the same bit pattern without relying on undefined behaviour.
     */
    screnc_state.length += (unsigned int)(base64_chars[tmpstr[4]] < 0 ? 0 : base64_chars[tmpstr[4]] << 2) << 24;
    screnc_state.length += ((base64_chars[tmpstr[5]] >> 4) < 0 ? 0 : (base64_chars[tmpstr[5]] >> 4)) << 24;

    cli_writen(ofd, "<script>", strlen("<script>"));
    while (screnc_state.length && line) {
        screnc_decode(ptr, &screnc_state);
        cli_writen(ofd, ptr, strlen((const char *)ptr));
        free(line);
        line = NULL;
        /* Fetch more input only while the decoder still expects bytes. */
        if (screnc_state.length) {
            ptr = line = cli_readchunk(NULL, &m_area, 8192);
        }
    }
    cli_writen(ofd, "</script>", strlen("</script>"));
    if (screnc_state.length)
        cli_dbgmsg("html_screnc_decode: missing %u bytes\n", screnc_state.length);
    retval = true;

done:
    close(ofd);
    if (line) {
        free(line);
    }
    return retval;
}