Coverage Report

Created: 2026-03-31 07:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/suricata7/libhtp/htp/htp_util.c
Line
Count
Source
1
/***************************************************************************
2
 * Copyright (c) 2009-2010 Open Information Security Foundation
3
 * Copyright (c) 2010-2013 Qualys, Inc.
4
 * All rights reserved.
5
 * 
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are
8
 * met:
9
 * 
10
 * - Redistributions of source code must retain the above copyright
11
 *   notice, this list of conditions and the following disclaimer.
12
13
 * - Redistributions in binary form must reproduce the above copyright
14
 *   notice, this list of conditions and the following disclaimer in the
15
 *   documentation and/or other materials provided with the distribution.
16
17
 * - Neither the name of the Qualys, Inc. nor the names of its
18
 *   contributors may be used to endorse or promote products derived from
19
 *   this software without specific prior written permission.
20
 * 
21
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 ***************************************************************************/
33
34
/**
35
 * @file
36
 * @author Ivan Ristic <ivanr@webkreator.com>
37
 */
38
39
#include "htp_config_auto.h"
40
41
//inet_pton
42
#if _WIN32
43
#include <ws2tcpip.h>
44
#else // mac, linux, freebsd
45
#include <sys/types.h>
46
#include <sys/socket.h>
47
#include <netinet/in.h>
48
#include <arpa/inet.h>
49
#endif
50
51
#include "htp_private.h"
52
53
/**
 * Tests whether a character is linear white space, which here
 * means a space or a horizontal tab.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a space or a tab, 0 otherwise.
 */
int htp_is_lws(int c) {
    switch (c) {
        case ' ':
        case '\t':
            return 1;
        default:
            return 0;
    }
}
63
64
/**
 * Tests whether a character is one of the HTTP separator characters:
 *
 *   ( ) < > @ , ; : \ " / [ ] ? = { } SP HT
 *
 * @param[in] c Character to test.
 * @return 1 if c is a separator, 0 otherwise.
 */
int htp_is_separator(int c) {
    static const char separators[] = {
        '(', ')', '<', '>', '@',
        ',', ';', ':', '\\', '"',
        '/', '[', ']', '?', '=',
        '{', '}', ' ', '\t'
    };

    for (size_t i = 0; i < sizeof (separators); i++) {
        if (c == separators[i]) return 1;
    }

    return 0;
}
101
102
/**
 * Tests whether a character counts as text: a horizontal tab, or any
 * value of 32 and above. All other values below 32 are rejected.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a text character, 0 otherwise.
 */
int htp_is_text(int c) {
    return (c == '\t') || (c >= 32);
}
113
114
/**
 * Tests whether a character may appear in a token.
 *
 *   token = 1*<any CHAR except CTLs or separators>
 *   CHAR  = <any US-ASCII character (octets 0 - 127)>
 *
 * Only values in the range 32..126 that are not separators are accepted.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a token character, 0 otherwise.
 */
int htp_is_token(int c) {
    return (c >= 32) && (c <= 126) && !htp_is_separator(c);
}
127
128
/**
129
 * Remove all line terminators (LF, CR or CRLF) from
130
 * the end of the line provided as input.
131
 *
132
 * @return 0 if nothing was removed, 1 if one or more LF characters were removed, or
133
 *         2 if one or more CR and/or LF characters were removed.
134
 */
135
3.32M
int htp_chomp(unsigned char *data, size_t *len) {
136
3.32M
    int r = 0;
137
138
    // Loop until there's no more stuff in the buffer
139
5.67M
    while (*len > 0) {
140
        // Try one LF first
141
5.64M
        if (data[*len - 1] == LF) {
142
1.98M
            (*len)--;
143
1.98M
            r = 1;
144
145
1.98M
            if (*len == 0) return r;
146
147
            // A CR is allowed before LF
148
1.98M
            if (data[*len - 1] == CR) {
149
678k
                (*len)--;
150
678k
                r = 2;
151
678k
            }
152
3.65M
        } else if (data[*len - 1] == CR) {
153
367k
            (*len)--;
154
367k
            r = 1;
155
3.28M
        } else return r;
156
5.64M
    }
157
158
33.6k
    return r;
159
3.32M
}
160
161
/**
 * Tests whether a character is white space: space, form feed,
 * vertical tab, horizontal tab, carriage return or line feed.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a white space character, 0 otherwise.
 */
int htp_is_space(int c) {
    return (c == ' ') || (c == '\f') || (c == '\v') ||
           (c == '\t') || (c == '\r') || (c == '\n');
}
180
181
/**
182
 * Converts request method, given as a string, into a number.
183
 *
184
 * @param[in] method
185
 * @return Method number of M_UNKNOWN
186
 */
187
864k
int htp_convert_method_to_number(bstr *method) {
188
864k
    if (method == NULL) return HTP_M_UNKNOWN;
189
190
    // TODO Optimize using parallel matching, or something similar.
191
192
864k
    if (bstr_cmp_c(method, "GET") == 0) return HTP_M_GET;
193
821k
    if (bstr_cmp_c(method, "PUT") == 0) return HTP_M_PUT;
194
820k
    if (bstr_cmp_c(method, "POST") == 0) return HTP_M_POST;
195
800k
    if (bstr_cmp_c(method, "DELETE") == 0) return HTP_M_DELETE;
196
800k
    if (bstr_cmp_c(method, "CONNECT") == 0) return HTP_M_CONNECT;
197
796k
    if (bstr_cmp_c(method, "OPTIONS") == 0) return HTP_M_OPTIONS;
198
796k
    if (bstr_cmp_c(method, "TRACE") == 0) return HTP_M_TRACE;
199
796k
    if (bstr_cmp_c(method, "PATCH") == 0) return HTP_M_PATCH;
200
796k
    if (bstr_cmp_c(method, "PROPFIND") == 0) return HTP_M_PROPFIND;
201
796k
    if (bstr_cmp_c(method, "PROPPATCH") == 0) return HTP_M_PROPPATCH;
202
796k
    if (bstr_cmp_c(method, "MKCOL") == 0) return HTP_M_MKCOL;
203
796k
    if (bstr_cmp_c(method, "COPY") == 0) return HTP_M_COPY;
204
796k
    if (bstr_cmp_c(method, "MOVE") == 0) return HTP_M_MOVE;
205
796k
    if (bstr_cmp_c(method, "LOCK") == 0) return HTP_M_LOCK;
206
796k
    if (bstr_cmp_c(method, "UNLOCK") == 0) return HTP_M_UNLOCK;
207
796k
    if (bstr_cmp_c(method, "VERSION-CONTROL") == 0) return HTP_M_VERSION_CONTROL;
208
796k
    if (bstr_cmp_c(method, "CHECKOUT") == 0) return HTP_M_CHECKOUT;
209
796k
    if (bstr_cmp_c(method, "UNCHECKOUT") == 0) return HTP_M_UNCHECKOUT;
210
796k
    if (bstr_cmp_c(method, "CHECKIN") == 0) return HTP_M_CHECKIN;
211
796k
    if (bstr_cmp_c(method, "UPDATE") == 0) return HTP_M_UPDATE;
212
796k
    if (bstr_cmp_c(method, "LABEL") == 0) return HTP_M_LABEL;
213
796k
    if (bstr_cmp_c(method, "REPORT") == 0) return HTP_M_REPORT;
214
796k
    if (bstr_cmp_c(method, "MKWORKSPACE") == 0) return HTP_M_MKWORKSPACE;
215
796k
    if (bstr_cmp_c(method, "MKACTIVITY") == 0) return HTP_M_MKACTIVITY;
216
796k
    if (bstr_cmp_c(method, "BASELINE-CONTROL") == 0) return HTP_M_BASELINE_CONTROL;
217
796k
    if (bstr_cmp_c(method, "MERGE") == 0) return HTP_M_MERGE;
218
796k
    if (bstr_cmp_c(method, "INVALID") == 0) return HTP_M_INVALID;
219
796k
    if (bstr_cmp_c(method, "HEAD") == 0) return HTP_M_HEAD;
220
221
796k
    return HTP_M_UNKNOWN;
222
796k
}
223
224
/**
225
 * Is the given line empty?
226
 *
227
 * @param[in] data
228
 * @param[in] len
229
 * @return 0 or 1
230
 */
231
3.18M
int htp_is_line_empty(unsigned char *data, size_t len) {
232
3.18M
    if (((len == 1) && ((data[0] == CR) || (data[0] == LF))) ||
233
2.24M
        ((len == 2) && (data[0] == CR) && (data[1] == LF))) {
234
1.05M
        return 1;
235
1.05M
    }
236
237
2.13M
    return 0;
238
3.18M
}
239
240
/**
 * Tests whether every byte of a line is white space according to isspace().
 *
 * @param[in] data Line data.
 * @param[in] len Line length.
 * @return 1 if all bytes are white space (also when len is 0), 0 as soon
 *         as a byte that is not white space is found.
 */
int htp_is_line_whitespace(unsigned char *data, size_t len) {
    while (len > 0) {
        if (!isspace(*data)) return 0;

        data++;
        len--;
    }

    return 1;
}
258
259
/**
260
 * Parses Content-Length string (positive decimal number).
261
 * White space is allowed before and after the number.
262
 *
263
 * @param[in] b
264
 * @return Content-Length as a number, or -1 on error.
265
 */
266
118k
int64_t htp_parse_content_length(bstr *b, htp_connp_t *connp) {
267
118k
    size_t len = bstr_len(b);
268
118k
    unsigned char * data = (unsigned char *) bstr_ptr(b);
269
118k
    size_t pos = 0;
270
118k
    int64_t r = 0;
271
272
118k
    if (len == 0) return -1003;
273
274
    // Ignore junk before
275
352k
    while ((pos < len) && (data[pos] < '0' || data[pos] > '9')) {
276
234k
        if (!htp_is_lws(data[pos]) && connp != NULL && r == 0) {
277
20.1k
            htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
278
20.1k
                    "C-L value with extra data in the beginning");
279
20.1k
            r = -1;
280
20.1k
        }
281
234k
        pos++;
282
234k
    }
283
117k
    if (pos == len) return -1001;
284
285
116k
    r = bstr_util_mem_to_pint(data + pos, len - pos, 10, &pos);
286
    // Ok to have junk afterwards
287
116k
    if (pos < len && connp != NULL) {
288
5.95k
        htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
289
5.95k
                "C-L value with extra data in the end");
290
5.95k
    }
291
116k
    return r;
292
117k
}
293
294
/**
 * Parses a chunk length (positive hexadecimal number). Leading CR, LF,
 * space, horizontal tab, vertical tab and form feed bytes are skipped.
 * Everything from the first byte that is not a hexadecimal digit onwards
 * is cut off before the number is parsed. An error is returned if the
 * chunk length is greater than INT32_MAX.
 *
 * @param[in] data Chunk length line.
 * @param[in] len Length of the line.
 * @param[out] extension May be NULL. Otherwise set to 1 when a semicolon is
 *                       found in the part that was cut off; left untouched
 *                       in every other case.
 * @return Chunk length, or a negative number on error (-1004 if nothing
 *         remains after skipping the leading bytes, -1 if the value is
 *         larger than INT32_MAX, otherwise the error reported by
 *         htp_parse_positive_integer_whitespace()).
 */
int64_t htp_parse_chunked_length(unsigned char *data, size_t len, int *extension) {
    // Skip leading line feeds and other control chars.
    for (; len > 0; data++, len--) {
        unsigned char ch = *data;
        int skippable = (ch == 0x0d) || (ch == 0x0a) || (ch == 0x20) ||
                        (ch == 0x09) || (ch == 0x0b) || (ch == 0x0c);
        if (!skippable) break;
    }

    if (len == 0) return -1004;

    // Count the hexadecimal digits at the front of what remains.
    size_t digits = 0;
    while (digits < len) {
        unsigned char ch = data[digits];
        int is_hex = isdigit(ch) || ((ch >= 'a') && (ch <= 'f')) || ((ch >= 'A') && (ch <= 'F'));
        if (!is_hex) break;
        digits++;
    }

    // Cut off trailing junk, noting whether it contains a chunk extension.
    if (digits != len) {
        if ((extension != NULL) && (memchr(data + digits, ';', len - digits) != NULL)) {
            *extension = 1;
        }

        len = digits;
    }

    int64_t chunk_len = htp_parse_positive_integer_whitespace(data, len, 16);
    if (chunk_len < 0) return chunk_len;

    return (chunk_len > INT32_MAX) ? -1 : chunk_len;
}
344
345
/**
 * A somewhat forgiving parser for a positive integer in a given base.
 * Only LWS (space and horizontal tab) is allowed before and after the number.
 *
 * @param[in] data Input buffer.
 * @param[in] len Length of the input buffer.
 * @param[in] base Numeric base, passed on to bstr_util_mem_to_pint().
 * @return The parsed number on success; a negative number on error:
 *         -1003 if the input is empty, -1001 if it contains only LWS,
 *         -1002 if something other than LWS follows the number, or the
 *         negative value reported by bstr_util_mem_to_pint().
 */
int64_t htp_parse_positive_integer_whitespace(unsigned char *data, size_t len, int base) {
    if (len == 0) return -1003;

    // Step over the LWS in front of the number.
    size_t cursor = 0;
    while ((cursor < len) && htp_is_lws(data[cursor])) cursor++;
    if (cursor == len) return -1001;

    size_t consumed;
    int64_t value = bstr_util_mem_to_pint(data + cursor, len - cursor, base, &consumed);
    if (value < 0) return value;

    // Continue right after the last digit; only LWS may follow.
    for (cursor += consumed; cursor < len; cursor++) {
        if (!htp_is_lws(data[cursor])) return -1002;
    }

    return value;
}
381
382
#ifdef HTP_DEBUG
383
384
/**
385
 * Prints one log message to stderr.
386
 *
387
 * @param[in] stream
388
 * @param[in] log
389
 */
390
void htp_print_log(FILE *stream, htp_log_t *log) {
391
    if (log->code != 0) {
392
        fprintf(stream, "[%d][code %d][file %s][line %d] %s\n", log->level,
393
                log->code, log->file, log->line, log->msg);
394
    } else {
395
        fprintf(stream, "[%d][file %s][line %d] %s\n", log->level,
396
                log->file, log->line, log->msg);
397
    }
398
}
399
#endif
400
401
/**
402
 * Records one log message.
403
 * 
404
 * @param[in] connp
405
 * @param[in] file
406
 * @param[in] line
407
 * @param[in] level
408
 * @param[in] code
409
 * @param[in] fmt
410
 */
411
1.86M
void htp_log(htp_connp_t *connp, const char *file, int line, enum htp_log_level_t level, int code, const char *fmt, ...) {
412
1.86M
    if (connp == NULL) return;
413
414
1.86M
    char buf[1024];
415
1.86M
    va_list args;
416
417
    // Ignore messages below our log level.
418
1.86M
    if (connp->cfg->log_level < level) {
419
0
        return;
420
0
    }
421
422
1.86M
    va_start(args, fmt);
423
424
1.86M
    int r = vsnprintf(buf, 1024, fmt, args);
425
426
1.86M
    va_end(args);
427
428
1.86M
    if (r < 0) {
429
0
        snprintf(buf, 1024, "[vnsprintf returned error %d]", r);
430
1.86M
    } else if (r >= 1024) {
431
        // Indicate overflow with a '+' at the end.
432
0
        buf[1022] = '+';
433
0
        buf[1023] = '\0';
434
0
    }
435
436
    // Create a new log entry.
437
438
1.86M
    htp_log_t *log = calloc(1, sizeof (htp_log_t));
439
1.86M
    if (log == NULL) return;
440
441
1.86M
    log->connp = connp;
442
1.86M
    log->file = file;
443
1.86M
    log->line = line;
444
1.86M
    log->level = level;
445
1.86M
    log->code = code;
446
1.86M
    log->msg = strdup(buf);
447
448
1.86M
    if (htp_list_add(connp->conn->messages, log) != HTP_OK) {
449
0
        free((void *) log->msg);
450
0
        free(log);
451
0
        return;
452
0
    }
453
454
1.86M
    if (level == HTP_LOG_ERROR) {
455
65.6k
        connp->last_error = log;
456
65.6k
    }
457
458
    #ifdef HTP_DEBUG
459
    fprintf(stderr, "[LOG] %s\n", log->msg);
460
    #endif
461
462
    /* coverity[check_return] */
463
1.86M
    htp_hook_run_all(connp->cfg->hook_log, log);
464
1.86M
}
465
466
/**
 * Determines whether the given line is a continuation (of some previous
 * line) by looking at its first byte.
 *
 * @param[in] data Line data.
 * @param[in] len Line length.
 * @return 1 if the first byte is a folding character, 0 if it is not,
 *         or -1 on error (NULL pointer or length zero).
 */
int htp_connp_is_line_folded(unsigned char *data, size_t len) {
    if ((data != NULL) && (len != 0)) {
        return htp_is_folding_char(data[0]);
    }

    return -1;
}
477
478
2.72M
/**
 * Tests whether a character marks a folded (continuation) line: a linear
 * white space character or a zero byte.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a folding character, 0 otherwise.
 */
int htp_is_folding_char(int c) {
    return ((c == 0) || htp_is_lws(c)) ? 1 : 0;
}
482
483
/**
484
 * Determines if the given line is a request terminator.
485
 *
486
 * @param[in] connp
487
 * @param[in] data
488
 * @param[in] len
489
 * @return 0 or 1
490
 */
491
3.18M
int htp_connp_is_line_terminator(htp_connp_t *connp, unsigned char *data, size_t len, int next_no_lf) {
492
    // Is this the end of request headers?
493
3.18M
    switch (connp->cfg->server_personality) {
494
0
        case HTP_SERVER_IIS_5_1:
495
            // IIS 5 will accept a whitespace line as a terminator
496
0
            if (htp_is_line_whitespace(data, len)) {
497
0
                return 1;
498
0
            }
499
500
            // Fall through
501
3.18M
        default:
502
            // Treat an empty line as terminator
503
3.18M
            if (htp_is_line_empty(data, len)) {
504
1.05M
                return 1;
505
1.05M
            }
506
            // Only space is terminator if terminator does not follow right away
507
2.13M
            if (len == 2 && htp_is_lws(data[0]) && data[1] == LF) {
508
152k
                return next_no_lf;
509
152k
            }
510
1.97M
            break;
511
3.18M
    }
512
513
1.97M
    return 0;
514
3.18M
}
515
516
/**
517
 * Determines if the given line can be ignored when it appears before a request.
518
 *
519
 * @param[in] connp
520
 * @param[in] data
521
 * @param[in] len
522
 * @return 0 or 1
523
 */
524
1.43M
int htp_connp_is_line_ignorable(htp_connp_t *connp, unsigned char *data, size_t len) {
525
1.43M
    return htp_connp_is_line_terminator(connp, data, len, 0);
526
1.43M
}
527
528
14.8k
static htp_status_t htp_parse_port(unsigned char *data, size_t len, int *port, int *invalid) {
529
14.8k
    if (len == 0) {
530
410
        *port = -1;
531
410
        *invalid = 1;
532
410
        return HTP_OK;
533
410
    }
534
535
14.4k
    int64_t port_parsed = htp_parse_positive_integer_whitespace(data, len, 10);
536
537
14.4k
    if (port_parsed < 0) {
538
        // Failed to parse the port number.
539
4.56k
        *port = -1;
540
4.56k
        *invalid = 1;
541
9.85k
    } else if ((port_parsed > 0) && (port_parsed < 65536)) {
542
        // Valid port number.
543
9.09k
        *port = (int) port_parsed;
544
9.09k
    } else {
545
        // Port number out of range.
546
766
        *port = -1;
547
766
        *invalid = 1;
548
766
    }
549
550
14.4k
    return HTP_OK;
551
14.8k
}
552
553
/**
554
 * Parses an authority string, which consists of a hostname with an optional port number; username
555
 * and password are not allowed and will not be handled.
556
 *
557
 * @param[in] hostport
558
 * @param[out] hostname A bstring containing the hostname, or NULL if the hostname is invalid. If this value
559
 *                      is not NULL, the caller assumes responsibility for memory management.
560
 * @param[out] port Port as text, or NULL if not provided.
561
 * @param[out] port_number Port number, or -1 if the port is not present or invalid.
562
 * @param[out] invalid Set to 1 if any part of the authority is invalid.
563
 * @return HTP_OK on success, HTP_ERROR on memory allocation failure.
564
 */
565
36.9k
htp_status_t htp_parse_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, int *invalid) {
566
36.9k
    if ((hostport == NULL) || (hostname == NULL) || (port_number == NULL) || (invalid == NULL)) return HTP_ERROR;
567
568
36.9k
    *hostname = NULL;
569
36.9k
    if (port != NULL) {
570
3.90k
        *port = NULL;
571
3.90k
    }
572
36.9k
    *port_number = -1;
573
36.9k
    *invalid = 0;
574
575
36.9k
    unsigned char *data = bstr_ptr(hostport);
576
36.9k
    size_t len = bstr_len(hostport);
577
578
36.9k
    bstr_util_mem_trim(&data, &len);
579
580
36.9k
    if (len == 0) {
581
2.09k
        *invalid = 1;
582
2.09k
        return HTP_OK;
583
2.09k
    }
584
585
    // Check for an IPv6 address.
586
34.8k
    if (data[0] == '[') {
587
        // IPv6 host.
588
589
        // Find the end of the IPv6 address.
590
3.29k
        size_t pos = 0;
591
105k
        while ((pos < len) && (data[pos] != ']')) pos++;
592
3.29k
        if (pos == len) {
593
1.30k
            *invalid = 1;
594
1.30k
            return HTP_OK;
595
1.30k
        }
596
597
1.99k
        *hostname = bstr_dup_mem(data, pos + 1);
598
1.99k
        if (*hostname == NULL) return HTP_ERROR;
599
600
        // Over the ']'.
601
1.99k
        pos++;
602
1.99k
        if (pos == len) return HTP_OK;
603
604
        // Handle port.
605
1.11k
        if (data[pos] == ':') {
606
580
            if (port != NULL) {
607
0
                *port = bstr_dup_mem(data + pos + 1, len - pos - 1);
608
0
                if (*port == NULL) {
609
0
                    bstr_free(*hostname);
610
0
                    return HTP_ERROR;
611
0
                }
612
0
            }
613
614
580
            return htp_parse_port(data + pos + 1, len - pos - 1, port_number, invalid);
615
580
        } else {
616
538
            *invalid = 1;
617
538
            return HTP_OK;
618
538
        }
619
31.5k
    } else {
620
        // Not IPv6 host.
621
622
        // Is there a colon?
623
31.5k
        unsigned char *colon = memchr(data, ':', len);
624
31.5k
        if (colon == NULL) {
625
            // Hostname alone, no port.
626
17.3k
            if (data[0] == '/' && (len == 1 || data[1] != '/')) {
627
                //If it starts with "//", we should skip (might have parsed a scheme and no creds)
628
                //If it starts with '/', this is a path, not a hostname
629
33
                *invalid = 1;
630
33
                return HTP_OK;
631
33
            }
632
633
17.2k
            *hostname = bstr_dup_mem(data, len);
634
17.2k
            if (*hostname == NULL) return HTP_ERROR;
635
636
17.2k
            bstr_to_lowercase(*hostname);
637
17.2k
        } else {
638
            // Hostname and port.
639
640
            // Ignore whitespace at the end of hostname.
641
14.2k
            unsigned char *hostend = colon;
642
16.9k
            while ((hostend > data) && (isspace(*(hostend - 1)))) hostend--;
643
644
14.2k
            *hostname = bstr_dup_mem(data, hostend - data);
645
14.2k
            if (*hostname == NULL) return HTP_ERROR;
646
647
14.2k
            if (port != NULL) {
648
3.46k
                *port = bstr_dup_mem(colon + 1, len - (colon + 1 - data));
649
3.46k
                if (*port == NULL) {
650
0
                    bstr_free(*hostname);
651
0
                    return HTP_ERROR;
652
0
                }
653
3.46k
            }
654
655
14.2k
            return htp_parse_port(colon + 1, len - (colon + 1 - data), port_number, invalid);
656
14.2k
        }
657
31.5k
    }
658
659
17.2k
    return HTP_OK;
660
34.8k
}
661
662
/**
663
 * Parses hostport provided in the URI.
664
 *
665
 * @param[in] connp
666
 * @param[in] hostport
667
 * @param[in] uri
668
 * @return HTP_OK on success or HTP_ERROR error.
669
 */
670
3.90k
int htp_parse_uri_hostport(htp_connp_t *connp, bstr *hostport, htp_uri_t *uri) {
671
3.90k
    int invalid;
672
673
3.90k
    htp_status_t rc = htp_parse_hostport(hostport, &(uri->hostname), &(uri->port), &(uri->port_number), &invalid);
674
3.90k
    if (rc != HTP_OK) return rc;
675
676
3.90k
    if (invalid) {
677
177
        connp->in_tx->flags |= HTP_HOSTU_INVALID;
678
177
    }
679
680
3.90k
    if (uri->hostname != NULL) {
681
3.90k
        if (htp_validate_hostname(uri->hostname) == 0) {
682
456
            connp->in_tx->flags |= HTP_HOSTU_INVALID;
683
456
        }
684
3.90k
    }
685
686
3.90k
    return HTP_OK;
687
3.90k
}
688
689
/**
690
 * Parses hostport provided in the Host header.
691
 * 
692
 * @param[in] hostport
693
 * @param[out] hostname
694
 * @param[out] port
695
 * @param[out] port_number
696
 * @param[out] flags
697
 * @return HTP_OK on success or HTP_ERROR error.
698
 */
699
33.0k
htp_status_t htp_parse_header_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, uint64_t *flags) {
700
33.0k
    int invalid;
701
702
33.0k
    htp_status_t rc = htp_parse_hostport(hostport, hostname, port, port_number, &invalid);
703
33.0k
    if (rc != HTP_OK) return rc;
704
705
33.0k
    if (invalid) {
706
9.53k
        *flags |= HTP_HOSTH_INVALID;
707
9.53k
    }
708
709
33.0k
    if (*hostname != NULL) {
710
29.6k
        if (htp_validate_hostname(*hostname) == 0) {
711
10.7k
            *flags |= HTP_HOSTH_INVALID;
712
10.7k
        }
713
29.6k
    }
714
715
33.0k
    return HTP_OK;
716
33.0k
}
717
718
/**
719
 * Parses request URI, making no attempt to validate the contents.
720
 * 
721
 * @param[in] input
722
 * @param[in] uri
723
 * @return HTP_ERROR on memory allocation failure, HTP_OK otherwise
724
 */
725
207k
int htp_parse_uri(bstr *input, htp_uri_t **uri) {
726
    // Allow a htp_uri_t structure to be provided on input,
727
    // but allocate a new one if the structure is NULL.
728
207k
    if (*uri == NULL) {
729
0
        *uri = calloc(1, sizeof (htp_uri_t));
730
0
        if (*uri == NULL) return HTP_ERROR;
731
0
    }
732
733
207k
    if (input == NULL) {
734
        // The input might be NULL on requests that don't actually
735
        // contain the URI. We allow that.
736
74.2k
        return HTP_OK;
737
74.2k
    }
738
739
132k
    unsigned char *data = bstr_ptr(input);
740
132k
    size_t len = bstr_len(input);
741
    // remove trailing spaces
742
132k
    while (len > 0) {
743
132k
        if (data[len-1] != ' ') {
744
132k
            break;
745
132k
        }
746
0
        len--;
747
0
    }
748
132k
    size_t start, pos;
749
750
132k
    if (len == 0) {
751
        // Empty string.
752
0
        return HTP_OK;
753
0
    }
754
755
132k
    pos = 0;
756
757
    // Scheme test: if it doesn't start with a forward slash character (which it must
758
    // for the contents to be a path or an authority, then it must be the scheme part
759
132k
    if (data[0] != '/') {
760
        // Parse scheme        
761
762
        // Find the colon, which marks the end of the scheme part
763
97.8k
        start = pos;
764
6.12M
        while ((pos < len) && (data[pos] != ':')) pos++;
765
766
97.8k
        if (pos >= len) {
767
            // We haven't found a colon, which means that the URI
768
            // is invalid. Apache will ignore this problem and assume
769
            // the URI contains an invalid path so, for the time being,
770
            // we are going to do the same.
771
56.6k
            pos = 0;
772
56.6k
        } else {
773
            // Make a copy of the scheme
774
41.1k
            (*uri)->scheme = bstr_dup_mem(data + start, pos - start);
775
41.1k
            if ((*uri)->scheme == NULL) return HTP_ERROR;
776
777
            // Go over the colon
778
41.1k
            pos++;
779
41.1k
        }
780
97.8k
    }
781
782
    // Authority test: two forward slash characters and it's an authority.
783
    // One, three or more slash characters, and it's a path. We, however,
784
    // only attempt to parse authority if we've seen a scheme.
785
132k
    if ((*uri)->scheme != NULL)
786
41.1k
        if ((pos + 2 < len) && (data[pos] == '/') && (data[pos + 1] == '/') && (data[pos + 2] != '/')) {
787
            // Parse authority
788
789
            // Go over the two slash characters
790
24.7k
            start = pos = pos + 2;
791
792
            // Authority ends with a question mark, forward slash or hash
793
2.59M
            while ((pos < len) && (data[pos] != '?') && (data[pos] != '/') && (data[pos] != '#')) pos++;
794
795
24.7k
            unsigned char *hostname_start;
796
24.7k
            size_t hostname_len;
797
798
            // Are the credentials included in the authority?
799
24.7k
            unsigned char *m = memchr(data + start, '@', pos - start);
800
24.7k
            if (m != NULL) {
801
                // Credentials present
802
5.40k
                unsigned char *credentials_start = data + start;
803
5.40k
                size_t credentials_len = m - data - start;
804
805
                // Figure out just the hostname part
806
5.40k
                hostname_start = data + start + credentials_len + 1;
807
5.40k
                hostname_len = pos - start - credentials_len - 1;
808
809
                // Extract the username and the password
810
5.40k
                m = memchr(credentials_start, ':', credentials_len);
811
5.40k
                if (m != NULL) {
812
                    // Username and password
813
3.48k
                    (*uri)->username = bstr_dup_mem(credentials_start, m - credentials_start);
814
3.48k
                    if ((*uri)->username == NULL) return HTP_ERROR;
815
3.48k
                    (*uri)->password = bstr_dup_mem(m + 1, credentials_len - (m - credentials_start) - 1);
816
3.48k
                    if ((*uri)->password == NULL) return HTP_ERROR;
817
3.48k
                } else {
818
                    // Username alone
819
1.91k
                    (*uri)->username = bstr_dup_mem(credentials_start, credentials_len);
820
1.91k
                    if ((*uri)->username == NULL) return HTP_ERROR;
821
1.91k
                }
822
19.3k
            } else {
823
                // No credentials
824
19.3k
                hostname_start = data + start;
825
19.3k
                hostname_len = pos - start;
826
19.3k
            }
827
828
            // Parsing authority without credentials.
829
24.7k
            if ((hostname_len > 0) && (hostname_start[0] == '[')) {
830
                // IPv6 address.
831
832
1.76k
                m = memchr(hostname_start, ']', hostname_len);
833
1.76k
                if (m == NULL) {
834
                    // Invalid IPv6 address; use the entire string as hostname.
835
1.03k
                    (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
836
1.03k
                    if ((*uri)->hostname == NULL) return HTP_ERROR;
837
1.03k
                } else {
838
730
                    (*uri)->hostname = bstr_dup_mem(hostname_start, m - hostname_start + 1);
839
730
                    if ((*uri)->hostname == NULL) return HTP_ERROR;
840
841
                    // Is there a port?
842
730
                    hostname_len = hostname_len - (m - hostname_start + 1);
843
730
                    hostname_start = m + 1;
844
845
                    // Port string
846
730
                    m = memchr(hostname_start, ':', hostname_len);
847
730
                    if (m != NULL) {
848
404
                        size_t port_len = hostname_len - (m - hostname_start) - 1;
849
404
                        (*uri)->port = bstr_dup_mem(m + 1, port_len);
850
404
                        if ((*uri)->port == NULL) return HTP_ERROR;
851
404
                    }
852
730
                }
853
23.0k
            } else {
854
                // Not IPv6 address.
855
856
23.0k
                m = memchr(hostname_start, ':', hostname_len);
857
23.0k
                if (m != NULL) {
858
9.11k
                    size_t port_len = hostname_len - (m - hostname_start) - 1;
859
9.11k
                    hostname_len = hostname_len - port_len - 1;
860
861
                    // Port string
862
9.11k
                    (*uri)->port = bstr_dup_mem(m + 1, port_len);
863
9.11k
                    if ((*uri)->port == NULL) return HTP_ERROR;
864
9.11k
                }
865
866
                // Hostname
867
23.0k
                (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
868
23.0k
                if ((*uri)->hostname == NULL) return HTP_ERROR;
869
23.0k
            }
870
24.7k
        }
871
872
    // Path
873
132k
    start = pos;
874
875
    // The path part will end with a question mark or a hash character, which
876
    // mark the beginning of the query part or the fragment part, respectively.
877
6.74M
    while ((pos < len) && (data[pos] != '?') && (data[pos] != '#')) pos++;
878
879
    // Path
880
132k
    (*uri)->path = bstr_dup_mem(data + start, pos - start);
881
132k
    if ((*uri)->path == NULL) return HTP_ERROR;
882
883
132k
    if (pos == len) return HTP_OK;
884
885
    // Query
886
16.8k
    if (data[pos] == '?') {
887
        // Step over the question mark
888
10.3k
        start = pos + 1;
889
890
        // The query part will end with the end of the input
891
        // or the beginning of the fragment part
892
913k
        while ((pos < len) && (data[pos] != '#')) pos++;
893
894
        // Query string
895
10.3k
        (*uri)->query = bstr_dup_mem(data + start, pos - start);
896
10.3k
        if ((*uri)->query == NULL) return HTP_ERROR;
897
898
10.3k
        if (pos == len) return HTP_OK;
899
10.3k
    }
900
901
    // Fragment
902
8.32k
    if (data[pos] == '#') {
903
        // Step over the hash character
904
8.32k
        start = pos + 1;
905
906
        // Fragment; ends with the end of the input
907
8.32k
        (*uri)->fragment = bstr_dup_mem(data + start, len - start);
908
8.32k
        if ((*uri)->fragment == NULL) return HTP_ERROR;
909
8.32k
    }
910
911
8.32k
    return HTP_OK;
912
8.32k
}
913
914
/**
 * Converts two input bytes, pointed to by the pointer parameter, into a
 * single byte by treating them as a pair of hexadecimal digits (most
 * significant first). No validation takes place: bytes that are not
 * hexadecimal digits are converted using the same arithmetic and the
 * result is reduced to one byte.
 *
 * @param[in] what Pointer to two bytes of input.
 * @return hex-decoded byte
 */
static unsigned char x2c(unsigned char *what) {
    int nibble[2];

    for (int i = 0; i < 2; i++) {
        unsigned char ch = what[i];

        if (ch >= 'A') {
            // Clearing bit 0x20 folds lowercase letters onto uppercase.
            nibble[i] = ((ch & 0xdf) - 'A') + 10;
        } else {
            nibble[i] = ch - '0';
        }
    }

    return (unsigned char) (nibble[0] * 16 + nibble[1]);
}
931
932
/**
933
 * Convert a Unicode codepoint into a single-byte, using best-fit
934
 * mapping (as specified in the provided configuration structure).
935
 *
936
 * @param[in] cfg
937
 * @param[in] codepoint
938
 * @return converted single byte
939
 */
940
0
static uint8_t bestfit_codepoint(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, uint32_t codepoint) {
941
    // Is it a single-byte codepoint?
942
0
    if (codepoint < 0x100) {
943
0
        return (uint8_t) codepoint;
944
0
    }
945
946
    // Our current implementation converts only the 2-byte codepoints.
947
0
    if (codepoint > 0xffff) {
948
0
        return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
949
0
    }
950
951
0
    uint8_t *p = cfg->decoder_cfgs[ctx].bestfit_map;
952
953
    // TODO Optimize lookup.
954
955
0
    for (;;) {
956
0
        uint32_t x = (p[0] << 8) + p[1];
957
958
0
        if (x == 0) {
959
0
            return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
960
0
        }
961
962
0
        if (x == codepoint) {
963
0
            return p[2];
964
0
        }
965
966
        // Move to the next triplet
967
0
        p += 3;
968
0
    }
969
0
}
970
971
/**
972
 * Decode a UTF-8 encoded path. Overlong characters will be decoded, invalid
973
 * characters will be left as-is. Best-fit mapping will be used to convert
974
 * UTF-8 into a single-byte stream.
975
 *
976
 * @param[in] cfg
977
 * @param[in] tx
978
 * @param[in] path
979
 */
980
0
void htp_utf8_decode_path_inplace(htp_cfg_t *cfg, htp_tx_t *tx, bstr *path) {
981
0
    if (path == NULL) return;
982
983
0
    uint8_t *data = bstr_ptr(path);
984
0
    if (data == NULL) return;
985
986
0
    size_t len = bstr_len(path);
987
0
    size_t rpos = 0;
988
0
    size_t wpos = 0;
989
0
    uint32_t codepoint = 0;
990
0
    uint32_t state = HTP_UTF8_ACCEPT;
991
0
    uint32_t counter = 0;
992
0
    uint8_t seen_valid = 0;
993
994
0
    while ((rpos < len)&&(wpos < len)) {
995
0
        counter++;
996
997
0
        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
998
0
            case HTP_UTF8_ACCEPT:
999
0
                if (counter == 1) {
1000
                    // ASCII character, which we just copy.
1001
0
                    data[wpos++] = (uint8_t) codepoint;
1002
0
                } else {
1003
                    // A valid UTF-8 character, which we need to convert.
1004
1005
0
                    seen_valid = 1;
1006
1007
                    // Check for overlong characters and set the flag accordingly.
1008
0
                    switch (counter) {
1009
0
                        case 2:
1010
0
                            if (codepoint < 0x80) {
1011
0
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1012
0
                            }
1013
0
                            break;
1014
0
                        case 3:
1015
0
                            if (codepoint < 0x800) {
1016
0
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1017
0
                            }
1018
0
                            break;
1019
0
                        case 4:
1020
0
                            if (codepoint < 0x10000) {
1021
0
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1022
0
                            }
1023
0
                            break;
1024
0
                    }
1025
1026
                    // Special flag for half-width/full-width evasion.
1027
0
                    if ((codepoint >= 0xff00) && (codepoint <= 0xffef)) {
1028
0
                        tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1029
0
                    }
1030
1031
                    // Use best-fit mapping to convert to a single byte.
1032
0
                    data[wpos++] = bestfit_codepoint(cfg, HTP_DECODER_URL_PATH, codepoint);
1033
0
                }
1034
1035
                // Advance over the consumed byte and reset the byte counter.
1036
0
                rpos++;
1037
0
                counter = 0;
1038
1039
0
                break;
1040
1041
0
            case HTP_UTF8_REJECT:
1042
                // Invalid UTF-8 character.
1043
1044
0
                tx->flags |= HTP_PATH_UTF8_INVALID;
1045
1046
                // Is the server expected to respond with 400?
1047
0
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1048
0
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted;
1049
0
                }
1050
1051
                // Output the replacement byte, replacing one or more invalid bytes.
1052
0
                data[wpos++] = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1053
1054
                // If the invalid byte was first in a sequence, consume it. Otherwise,
1055
                // assume it's the starting byte of the next character.
1056
0
                if (counter == 1) {
1057
0
                    rpos++;
1058
0
                }
1059
1060
                // Reset the decoder state and continue decoding.
1061
0
                state = HTP_UTF8_ACCEPT;
1062
0
                codepoint = 0;
1063
0
                counter = 0;
1064
1065
0
                break;
1066
1067
0
            default:
1068
                // Keep going; the character is not yet formed.
1069
0
                rpos++;
1070
0
                break;
1071
0
        }
1072
0
    }
1073
1074
    // Did the input stream seem like a valid UTF-8 string?
1075
0
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
1076
0
        tx->flags |= HTP_PATH_UTF8_VALID;
1077
0
    }
1078
1079
    // Adjust the length of the string, because
1080
    // we're doing in-place decoding.
1081
0
    bstr_adjust_len(path, wpos);
1082
0
}
1083
1084
/**
1085
 * Validate a path that is quite possibly UTF-8 encoded.
1086
 * 
1087
 * @param[in] tx
1088
 * @param[in] path
1089
 */
1090
132k
void htp_utf8_validate_path(htp_tx_t *tx, bstr *path) {
1091
132k
    unsigned char *data = bstr_ptr(path);
1092
132k
    size_t len = bstr_len(path);
1093
132k
    size_t rpos = 0;
1094
132k
    uint32_t codepoint = 0;
1095
132k
    uint32_t state = HTP_UTF8_ACCEPT;
1096
132k
    uint32_t counter = 0; // How many bytes used by a UTF-8 character.
1097
132k
    uint8_t seen_valid = 0;
1098
1099
6.74M
    while (rpos < len) {
1100
6.60M
        counter++;
1101
1102
6.60M
        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
1103
5.10M
            case HTP_UTF8_ACCEPT:
1104
                // We have a valid character.
1105
1106
5.10M
                if (counter > 1) {
1107
                    // A valid UTF-8 character, consisting of 2 or more bytes.
1108
1109
27.1k
                    seen_valid = 1;
1110
1111
                    // Check for overlong characters and set the flag accordingly.
1112
27.1k
                    switch (counter) {
1113
15.2k
                        case 2:
1114
15.2k
                            if (codepoint < 0x80) {
1115
2.04k
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1116
2.04k
                            }
1117
15.2k
                            break;
1118
9.38k
                        case 3:
1119
9.38k
                            if (codepoint < 0x800) {
1120
855
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1121
855
                            }
1122
9.38k
                            break;
1123
2.54k
                        case 4:
1124
2.54k
                            if (codepoint < 0x10000) {
1125
121
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1126
121
                            }
1127
2.54k
                            break;
1128
27.1k
                    }
1129
27.1k
                }
1130
1131
                // Special flag for half-width/full-width evasion.
1132
5.10M
                if ((codepoint > 0xfeff) && (codepoint < 0x010000)) {
1133
777
                    tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1134
777
                }
1135
1136
                // Advance over the consumed byte and reset the byte counter.
1137
5.10M
                rpos++;
1138
5.10M
                counter = 0;
1139
1140
5.10M
                break;
1141
1142
1.27M
            case HTP_UTF8_REJECT:
1143
                // Invalid UTF-8 character.
1144
1145
1.27M
                tx->flags |= HTP_PATH_UTF8_INVALID;
1146
1147
                // Override the decoder state because we want to continue decoding.
1148
1.27M
                state = HTP_UTF8_ACCEPT;
1149
1150
                // Advance over the consumed byte and reset the byte counter.
1151
1.27M
                rpos++;
1152
1.27M
                counter = 0;
1153
1154
1.27M
                break;
1155
1156
234k
            default:
1157
                // Keep going; the character is not yet formed.
1158
234k
                rpos++;
1159
234k
                break;
1160
6.60M
        }
1161
6.60M
    }
1162
1163
    // Did the input stream seem like a valid UTF-8 string?
1164
132k
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
1165
1.33k
        tx->flags |= HTP_PATH_UTF8_VALID;
1166
1.33k
    }
1167
132k
}
1168
1169
/**
1170
 * Decode a %u-encoded character, using best-fit mapping as necessary. Path version.
1171
 *
1172
 * @param[in] cfg
1173
 * @param[in] tx
1174
 * @param[in] data
1175
 * @return decoded byte
1176
 */
1177
0
static uint8_t decode_u_encoding_path(htp_cfg_t *cfg, htp_tx_t *tx, unsigned char *data) {
1178
0
    uint8_t c1 = x2c(data);
1179
0
    uint8_t c2 = x2c(data + 2);
1180
0
    uint8_t r = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1181
1182
0
    if (c1 == 0x00) {
1183
0
        r = c2;
1184
0
        tx->flags |= HTP_PATH_OVERLONG_U;
1185
0
    } else {
1186
        // Check for fullwidth form evasion
1187
0
        if (c1 == 0xff) {
1188
0
            tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1189
0
        }
1190
1191
0
        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1192
0
            tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1193
0
        }
1194
1195
        // Use best-fit mapping
1196
0
        unsigned char *p = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_map;
1197
1198
        // TODO Optimize lookup.
1199
1200
0
        for (;;) {
1201
            // Have we reached the end of the map?
1202
0
            if ((p[0] == 0) && (p[1] == 0)) {
1203
0
                break;
1204
0
            }
1205
1206
            // Have we found the mapping we're looking for?
1207
0
            if ((p[0] == c1) && (p[1] == c2)) {
1208
0
                r = p[2];
1209
0
                break;
1210
0
            }
1211
1212
            // Move to the next triplet
1213
0
            p += 3;
1214
0
        }
1215
0
    }
1216
1217
    // Check for encoded path separators
1218
0
    if ((r == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (r == '\\'))) {
1219
0
        tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1220
0
    }
1221
1222
0
    return r;
1223
0
}
1224
1225
/**
1226
 * Decode a %u-encoded character, using best-fit mapping as necessary. Params version.
1227
 *
1228
 * @param[in] cfg
1229
 * @param[in] tx
1230
 * @param[in] data
1231
 * @return decoded byte
1232
 */
1233
0
static uint8_t decode_u_encoding_params(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, unsigned char *data, uint64_t *flags) {
1234
0
    uint8_t c1 = x2c(data);
1235
0
    uint8_t c2 = x2c(data + 2);
1236
1237
    // Check for overlong usage first.
1238
0
    if (c1 == 0) {
1239
0
        (*flags) |= HTP_URLEN_OVERLONG_U;
1240
0
        return c2;
1241
0
    }
1242
1243
    // Both bytes were used.
1244
1245
    // Detect half-width and full-width range.
1246
0
    if ((c1 == 0xff) && (c2 <= 0xef)) {
1247
0
        (*flags) |= HTP_URLEN_HALF_FULL_RANGE;
1248
0
    }
1249
1250
    // Use best-fit mapping.
1251
0
    unsigned char *p = cfg->decoder_cfgs[ctx].bestfit_map;
1252
0
    uint8_t r = cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
1253
1254
    // TODO Optimize lookup.
1255
1256
0
    for (;;) {
1257
        // Have we reached the end of the map?
1258
0
        if ((p[0] == 0) && (p[1] == 0)) {
1259
0
            break;
1260
0
        }
1261
1262
        // Have we found the mapping we're looking for?
1263
0
        if ((p[0] == c1) && (p[1] == c2)) {
1264
0
            r = p[2];
1265
0
            break;
1266
0
        }
1267
1268
        // Move to the next triplet
1269
0
        p += 3;
1270
0
    }
1271
1272
0
    return r;
1273
0
}
1274
1275
/**
1276
 * Decode a request path according to the settings in the
1277
 * provided configuration structure.
1278
 *
1279
 * @param[in] cfg
1280
 * @param[in] tx
1281
 * @param[in] path
1282
 */
1283
132k
htp_status_t htp_decode_path_inplace(htp_tx_t *tx, bstr *path) {
1284
132k
    if (path == NULL) return HTP_ERROR;
1285
132k
    unsigned char *data = bstr_ptr(path);
1286
132k
    if (data == NULL) return HTP_ERROR;
1287
1288
132k
    size_t len = bstr_len(path);
1289
1290
132k
    htp_cfg_t *cfg = tx->cfg;
1291
1292
132k
    size_t rpos = 0;
1293
132k
    size_t wpos = 0;
1294
132k
    int previous_was_separator = 0;
1295
1296
6.74M
    while ((rpos < len) && (wpos < len)) {
1297
6.60M
        uint8_t c = data[rpos];
1298
1299
        // Decode encoded characters
1300
6.60M
        if (c == '%') {
1301
53.2k
            if (rpos + 2 < len) {
1302
51.7k
                int handled = 0;
1303
1304
51.7k
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_decode) {
1305
                    // Check for the %u encoding
1306
0
                    if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1307
0
                        handled = 1;
1308
1309
0
                        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1310
0
                            tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1311
0
                        }
1312
1313
0
                        if (rpos + 5 < len) {
1314
0
                            if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1315
0
                                    && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1316
                                // Decode a valid %u encoding
1317
0
                                c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
1318
0
                                rpos += 6;
1319
1320
0
                                if (c == 0) {
1321
0
                                    tx->flags |= HTP_PATH_ENCODED_NUL;
1322
1323
0
                                    if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1324
0
                                        tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
1325
0
                                    }
1326
0
                                }
1327
0
                            } else {
1328
                                // Invalid %u encoding
1329
0
                                tx->flags |= HTP_PATH_INVALID_ENCODING;
1330
1331
0
                                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1332
0
                                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1333
0
                                }
1334
1335
0
                                switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1336
0
                                    case HTP_URL_DECODE_REMOVE_PERCENT:
1337
                                        // Do not place anything in output; eat
1338
                                        // the percent character
1339
0
                                        rpos++;
1340
0
                                        continue;
1341
0
                                        break;
1342
0
                                    case HTP_URL_DECODE_PRESERVE_PERCENT:
1343
                                        // Leave the percent character in output
1344
0
                                        rpos++;
1345
0
                                        break;
1346
0
                                    case HTP_URL_DECODE_PROCESS_INVALID:
1347
                                        // Decode invalid %u encoding
1348
0
                                        c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
1349
0
                                        rpos += 6;
1350
0
                                        break;
1351
0
                                }
1352
0
                            }
1353
0
                        } else {
1354
                            // Invalid %u encoding (not enough data)
1355
0
                            tx->flags |= HTP_PATH_INVALID_ENCODING;
1356
1357
0
                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1358
0
                                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1359
0
                            }
1360
1361
0
                            switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1362
0
                                case HTP_URL_DECODE_REMOVE_PERCENT:
1363
                                    // Do not place anything in output; eat
1364
                                    // the percent character
1365
0
                                    rpos++;
1366
0
                                    continue;
1367
0
                                    break;
1368
0
                                case HTP_URL_DECODE_PRESERVE_PERCENT:
1369
                                    // Leave the percent character in output
1370
0
                                    rpos++;
1371
0
                                    break;
1372
0
                                case HTP_URL_DECODE_PROCESS_INVALID:
1373
                                    // Cannot decode, because there's not enough data.
1374
                                    // Leave the percent character in output
1375
0
                                    rpos++;
1376
                                    // TODO Configurable handling.
1377
0
                                    break;
1378
0
                            }
1379
0
                        }
1380
0
                    }
1381
0
                }
1382
1383
                // Handle standard URL encoding
1384
51.7k
                if (!handled) {
1385
51.7k
                    if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1386
3.48k
                        c = x2c(&data[rpos + 1]);
1387
1388
3.48k
                        if (c == 0) {
1389
672
                            tx->flags |= HTP_PATH_ENCODED_NUL;
1390
1391
672
                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1392
0
                                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
1393
0
                            }
1394
1395
672
                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_terminates) {
1396
0
                                bstr_adjust_len(path, wpos);
1397
0
                                return HTP_OK;
1398
0
                            }
1399
672
                        }
1400
1401
3.48k
                        if ((c == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (c == '\\'))) {
1402
316
                            tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1403
1404
316
                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1405
0
                                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted;
1406
0
                            }
1407
1408
316
                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_decode) {
1409
                                // Decode
1410
0
                                rpos += 3;
1411
316
                            } else {
1412
                                // Leave encoded
1413
316
                                c = '%';
1414
316
                                rpos++;
1415
316
                            }
1416
3.17k
                        } else {
1417
                            // Decode
1418
3.17k
                            rpos += 3;
1419
3.17k
                        }
1420
48.2k
                    } else {
1421
                        // Invalid encoding
1422
48.2k
                        tx->flags |= HTP_PATH_INVALID_ENCODING;
1423
1424
48.2k
                        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1425
0
                            tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1426
0
                        }
1427
1428
48.2k
                        switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1429
0
                            case HTP_URL_DECODE_REMOVE_PERCENT:
1430
                                // Do not place anything in output; eat
1431
                                // the percent character
1432
0
                                rpos++;
1433
0
                                continue;
1434
0
                                break;
1435
48.2k
                            case HTP_URL_DECODE_PRESERVE_PERCENT:
1436
                                // Leave the percent character in output
1437
48.2k
                                rpos++;
1438
48.2k
                                break;
1439
0
                            case HTP_URL_DECODE_PROCESS_INVALID:
1440
                                // Decode
1441
0
                                c = x2c(&data[rpos + 1]);
1442
0
                                rpos += 3;
1443
                                // Note: What if an invalid encoding decodes into a path
1444
                                //       separator? This is theoretical at the moment, because
1445
                                //       the only platform we know doesn't convert separators is
1446
                                //       Apache, who will also respond with 400 if invalid encoding
1447
                                //       is encountered. Thus no check for a separator here.
1448
0
                                break;
1449
0
                            default:
1450
                                // Unknown setting
1451
0
                                return HTP_ERROR;
1452
0
                                break;
1453
48.2k
                        }
1454
48.2k
                    }
1455
51.7k
                }
1456
51.7k
            } else {
1457
                // Invalid URL encoding (not enough data)
1458
1.50k
                tx->flags |= HTP_PATH_INVALID_ENCODING;
1459
1460
1.50k
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1461
0
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1462
0
                }
1463
1464
1.50k
                switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1465
0
                    case HTP_URL_DECODE_REMOVE_PERCENT:
1466
                        // Do not place anything in output; eat
1467
                        // the percent character
1468
0
                        rpos++;
1469
0
                        continue;
1470
0
                        break;
1471
1.50k
                    case HTP_URL_DECODE_PRESERVE_PERCENT:
1472
                        // Leave the percent character in output
1473
1.50k
                        rpos++;
1474
1.50k
                        break;
1475
0
                    case HTP_URL_DECODE_PROCESS_INVALID:
1476
                        // Cannot decode, because there's not enough data.
1477
                        // Leave the percent character in output.
1478
                        // TODO Configurable handling.
1479
0
                        rpos++;
1480
0
                        break;
1481
1.50k
                }
1482
1.50k
            }
1483
6.55M
        } else {
1484
            // One non-encoded character
1485
1486
            // Is it a NUL byte?
1487
6.55M
            if (c == 0) {
1488
1.54M
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1489
0
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted;
1490
0
                }
1491
1492
1.54M
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_terminates) {
1493
                    // Terminate path with a raw NUL byte
1494
0
                    bstr_adjust_len(path, wpos);
1495
0
                    return HTP_OK;
1496
0
                    break;
1497
0
                }
1498
1.54M
            }
1499
1500
6.55M
            rpos++;
1501
6.55M
        }
1502
1503
        // Place the character into output
1504
1505
        // Check for control characters
1506
6.60M
        if (c < 0x20) {
1507
2.17M
            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted != HTP_UNWANTED_IGNORE) {
1508
0
                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted;
1509
0
            }
1510
2.17M
        }
1511
1512
        // Convert backslashes to forward slashes, if necessary
1513
6.60M
        if ((c == '\\') && (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes)) {
1514
0
            c = '/';
1515
0
        }
1516
1517
        // Lowercase characters, if necessary
1518
6.60M
        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].convert_lowercase) {
1519
0
            c = (uint8_t) tolower(c);
1520
0
        }
1521
1522
        // If we're compressing separators then we need
1523
        // to track if the previous character was a separator
1524
6.60M
        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_compress) {
1525
0
            if (c == '/') {
1526
0
                if (!previous_was_separator) {
1527
0
                    data[wpos++] = c;
1528
0
                    previous_was_separator = 1;
1529
0
                } else {
1530
                    // Do nothing; we don't want
1531
                    // another separator in output
1532
0
                }
1533
0
            } else {
1534
0
                data[wpos++] = c;
1535
0
                previous_was_separator = 0;
1536
0
            }
1537
6.60M
        } else {
1538
6.60M
            data[wpos++] = c;
1539
6.60M
        }
1540
6.60M
    }
1541
1542
132k
    bstr_adjust_len(path, wpos);
1543
1544
132k
    return HTP_OK;
1545
132k
}
1546
1547
45.9k
/**
 * URL-decode the input in place using the URL path decoder settings of the
 * transaction's configuration, then translate the HTP_URLEN_* flags raised
 * by the decoder into the corresponding HTP_PATH_* flags on the transaction.
 *
 * Only invalid encoding, encoded NUL and raw NUL are translated. The
 * expected response status is written straight into the transaction by the
 * decoder.
 *
 * @param[in] tx
 * @param[in] input
 * @return the status returned by htp_urldecode_inplace_ex()
 */
htp_status_t htp_tx_urldecode_uri_inplace(htp_tx_t *tx, bstr *input) {
    uint64_t urlen_flags = 0;

    const htp_status_t status = htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URL_PATH, input,
            &urlen_flags, &(tx->response_status_expected_number));

    // Collect the path-level equivalents first, then apply them in one go.
    uint64_t path_flags = 0;

    if (urlen_flags & HTP_URLEN_INVALID_ENCODING) path_flags |= HTP_PATH_INVALID_ENCODING;
    if (urlen_flags & HTP_URLEN_ENCODED_NUL) path_flags |= HTP_PATH_ENCODED_NUL;
    if (urlen_flags & HTP_URLEN_RAW_NUL) path_flags |= HTP_PATH_RAW_NUL;

    tx->flags |= path_flags;

    return status;
}
1566
1567
0
/**
 * URL-decode the input in place using the URLENCODED decoder settings of
 * the transaction's configuration. Decoder flags and the expected response
 * status are written directly into the transaction.
 *
 * @param[in] tx
 * @param[in] input
 * @return the status returned by htp_urldecode_inplace_ex()
 */
htp_status_t htp_tx_urldecode_params_inplace(htp_tx_t *tx, bstr *input) {
    uint64_t *tx_flags = &(tx->flags);
    int *expected_status = &(tx->response_status_expected_number);

    return htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URLENCODED, input, tx_flags, expected_status);
}
1570
1571
10.3k
/**
 * URL-decode the input in place for callers that are not interested in the
 * expected response status: the status code produced by the decoder is
 * collected in a local variable and discarded.
 *
 * @param[in] cfg
 * @param[in] ctx
 * @param[in] input
 * @param[in,out] flags
 * @return the status returned by htp_urldecode_inplace_ex()
 */
htp_status_t htp_urldecode_inplace(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags) {
    // Scratch destination for the status code, which this variant does not report.
    int discarded_status = 0;

    const htp_status_t rc = htp_urldecode_inplace_ex(cfg, ctx, input, flags, &discarded_status);

    return rc;
}
1575
1576
56.2k
htp_status_t htp_urldecode_inplace_ex(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags, int *expected_status_code) {
1577
56.2k
    if (input == NULL) return HTP_ERROR;
1578
1579
56.2k
    unsigned char *data = bstr_ptr(input);
1580
56.2k
    if (data == NULL) return HTP_ERROR;
1581
56.2k
    size_t len = bstr_len(input);
1582
1583
56.2k
    size_t rpos = 0;
1584
56.2k
    size_t wpos = 0;
1585
1586
4.06M
    while ((rpos < len) && (wpos < len)) {
1587
4.00M
        uint8_t c = data[rpos];
1588
1589
        // Decode encoded characters.
1590
4.00M
        if (c == '%') {
1591
            // Need at least 2 additional bytes for %HH.
1592
83.3k
            if (rpos + 2 < len) {
1593
77.9k
                int handled = 0;
1594
1595
                // Decode %uHHHH encoding, but only if allowed in configuration.
1596
77.9k
                if (cfg->decoder_cfgs[ctx].u_encoding_decode) {
1597
                    // The next character must be a case-insensitive u.
1598
0
                    if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1599
0
                        handled = 1;
1600
1601
0
                        if (cfg->decoder_cfgs[ctx].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1602
0
                            (*expected_status_code) = cfg->decoder_cfgs[ctx].u_encoding_unwanted;
1603
0
                        }
1604
1605
                        // Need at least 5 additional bytes for %uHHHH.
1606
0
                        if (rpos + 5 < len) {
1607
0
                            if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1608
0
                                    && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1609
                                // Decode a valid %u encoding.
1610
0
                                c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1611
0
                                rpos += 6;
1612
0
                            } else {
1613
                                // Invalid %u encoding (could not find 4 xdigits).
1614
0
                                (*flags) |= HTP_URLEN_INVALID_ENCODING;
1615
1616
0
                                if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1617
0
                                    (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1618
0
                                }
1619
1620
0
                                switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1621
0
                                    case HTP_URL_DECODE_REMOVE_PERCENT:
1622
                                        // Do not place anything in output; consume the %.
1623
0
                                        rpos++;
1624
0
                                        continue;
1625
0
                                        break;
1626
0
                                    case HTP_URL_DECODE_PRESERVE_PERCENT:
1627
                                        // Leave the % in output.
1628
0
                                        rpos++;
1629
0
                                        break;
1630
0
                                    case HTP_URL_DECODE_PROCESS_INVALID:
1631
                                        // Decode invalid %u encoding.
1632
0
                                        c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1633
0
                                        rpos += 6;
1634
0
                                        break;
1635
0
                                }
1636
0
                            }
1637
0
                        } else {
1638
                            // Invalid %u encoding; not enough data.
1639
0
                            (*flags) |= HTP_URLEN_INVALID_ENCODING;
1640
1641
0
                            if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1642
0
                                (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1643
0
                            }
1644
1645
0
                            switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1646
0
                                case HTP_URL_DECODE_REMOVE_PERCENT:
1647
                                    // Do not place anything in output; consume the %.
1648
0
                                    rpos++;
1649
0
                                    continue;
1650
0
                                    break;
1651
0
                                case HTP_URL_DECODE_PRESERVE_PERCENT:
1652
                                    // Leave the % in output.
1653
0
                                    rpos++;
1654
0
                                    break;
1655
0
                                case HTP_URL_DECODE_PROCESS_INVALID:
1656
                                    // Cannot decode because there's not enough data.
1657
                                    // Leave the % in output.
1658
                                    // TODO Configurable handling of %, u, etc.
1659
0
                                    rpos++;
1660
0
                                    break;
1661
0
                            }
1662
0
                        }
1663
0
                    }
1664
0
                }
1665
1666
                // Handle standard URL encoding.
1667
77.9k
                if (!handled) {
1668
                    // Need 2 hexadecimal digits.
1669
77.9k
                    if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1670
                        // Decode %HH encoding.
1671
18.5k
                        c = x2c(&(data[rpos + 1]));
1672
18.5k
                        rpos += 3;
1673
59.3k
                    } else {
1674
                        // Invalid encoding (enough bytes, but not hexadecimal digits).
1675
59.3k
                        (*flags) |= HTP_URLEN_INVALID_ENCODING;
1676
1677
59.3k
                        if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1678
0
                            (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1679
0
                        }
1680
1681
59.3k
                        switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1682
0
                            case HTP_URL_DECODE_REMOVE_PERCENT:
1683
                                // Do not place anything in output; consume the %.
1684
0
                                rpos++;
1685
0
                                continue;
1686
0
                                break;
1687
59.3k
                            case HTP_URL_DECODE_PRESERVE_PERCENT:
1688
                                // Leave the % in output.
1689
59.3k
                                rpos++;
1690
59.3k
                                break;
1691
0
                            case HTP_URL_DECODE_PROCESS_INVALID:
1692
                                // Decode.
1693
0
                                c = x2c(&(data[rpos + 1]));
1694
0
                                rpos += 3;
1695
0
                                break;
1696
59.3k
                        }
1697
59.3k
                    }
1698
77.9k
                }
1699
77.9k
            } else {
1700
                // Invalid encoding; not enough data (at least 2 bytes required).
1701
5.46k
                (*flags) |= HTP_URLEN_INVALID_ENCODING;
1702
1703
5.46k
                if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1704
0
                    (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1705
0
                }
1706
1707
5.46k
                switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1708
0
                    case HTP_URL_DECODE_REMOVE_PERCENT:
1709
                        // Do not place anything in output; consume the %.
1710
0
                        rpos++;
1711
0
                        continue;
1712
0
                        break;
1713
5.46k
                    case HTP_URL_DECODE_PRESERVE_PERCENT:
1714
                        // Leave the % in output.
1715
5.46k
                        rpos++;
1716
5.46k
                        break;
1717
0
                    case HTP_URL_DECODE_PROCESS_INVALID:
1718
                        // Cannot decode because there's not enough data.
1719
                        // Leave the % in output.
1720
                        // TODO Configurable handling of %, etc.
1721
0
                        rpos++;
1722
0
                        break;
1723
5.46k
                }
1724
5.46k
            }
1725
1726
            // Did we get an encoded NUL byte?
1727
83.3k
            if (c == 0) {
1728
1.58k
                if (cfg->decoder_cfgs[ctx].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1729
0
                    (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_encoded_unwanted;
1730
0
                }
1731
1732
1.58k
                (*flags) |= HTP_URLEN_ENCODED_NUL;
1733
1734
1.58k
                if (cfg->decoder_cfgs[ctx].nul_encoded_terminates) {
1735
                    // Terminate the path at the raw NUL byte.
1736
0
                    bstr_adjust_len(input, wpos);
1737
0
                    return 1;
1738
0
                }
1739
1.58k
            }
1740
1741
83.3k
            data[wpos++] = c;
1742
3.92M
        } else if (c == '+') {
1743
            // Decoding of the plus character is conditional on the configuration.
1744
1745
5.94k
            if (cfg->decoder_cfgs[ctx].plusspace_decode) {
1746
0
                c = 0x20;
1747
0
            }
1748
1749
5.94k
            rpos++;
1750
5.94k
            data[wpos++] = c;
1751
3.91M
        } else {
1752
            // One non-encoded byte.
1753
1754
            // Did we get a raw NUL byte?
1755
3.91M
            if (c == 0) {
1756
556k
                if (cfg->decoder_cfgs[ctx].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1757
0
                    (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_raw_unwanted;
1758
0
                }
1759
1760
556k
                (*flags) |= HTP_URLEN_RAW_NUL;
1761
1762
556k
                if (cfg->decoder_cfgs[ctx].nul_raw_terminates) {
1763
                    // Terminate the path at the encoded NUL byte.
1764
0
                    bstr_adjust_len(input, wpos);
1765
0
                    return HTP_OK;
1766
0
                }
1767
556k
            }
1768
1769
3.91M
            rpos++;
1770
3.91M
            data[wpos++] = c;
1771
3.91M
        }
1772
4.00M
    }
1773
1774
56.2k
    bstr_adjust_len(input, wpos);
1775
1776
56.2k
    return HTP_OK;
1777
56.2k
}
1778
1779
/**
1780
 * Normalize a previously-parsed request URI.
1781
 *
1782
 * @param[in] connp
1783
 * @param[in] incomplete
1784
 * @param[in] normalized
1785
 * @return HTP_OK or HTP_ERROR
1786
 */
1787
211k
int htp_normalize_parsed_uri(htp_tx_t *tx, htp_uri_t *incomplete, htp_uri_t *normalized) {
1788
    // Scheme.
1789
211k
    if (incomplete->scheme != NULL) {
1790
        // Duplicate and convert to lowercase.
1791
41.1k
        normalized->scheme = bstr_dup_lower(incomplete->scheme);
1792
41.1k
        if (normalized->scheme == NULL) return HTP_ERROR;
1793
41.1k
    }
1794
1795
    // Username.
1796
211k
    if (incomplete->username != NULL) {
1797
5.40k
        normalized->username = bstr_dup(incomplete->username);
1798
5.40k
        if (normalized->username == NULL) return HTP_ERROR;
1799
5.40k
        htp_tx_urldecode_uri_inplace(tx, normalized->username);
1800
5.40k
    }
1801
1802
    // Password.
1803
211k
    if (incomplete->password != NULL) {
1804
3.48k
        normalized->password = bstr_dup(incomplete->password);
1805
3.48k
        if (normalized->password == NULL) return HTP_ERROR;
1806
3.48k
        htp_tx_urldecode_uri_inplace(tx, normalized->password);
1807
3.48k
    }
1808
1809
    // Hostname.
1810
211k
    if (incomplete->hostname != NULL) {
1811
        // We know that incomplete->hostname does not contain
1812
        // port information, so no need to check for it here.
1813
28.6k
        normalized->hostname = bstr_dup(incomplete->hostname);
1814
28.6k
        if (normalized->hostname == NULL) return HTP_ERROR;
1815
28.6k
        htp_tx_urldecode_uri_inplace(tx, normalized->hostname);
1816
28.6k
        htp_normalize_hostname_inplace(normalized->hostname);
1817
28.6k
    }
1818
1819
    // Port.
1820
211k
    if (incomplete->port != NULL) {
1821
12.9k
        int64_t port_parsed = htp_parse_positive_integer_whitespace(
1822
12.9k
                bstr_ptr(incomplete->port), bstr_len(incomplete->port), 10);
1823
1824
12.9k
        if (port_parsed < 0) {
1825
            // Failed to parse the port number.
1826
7.03k
            normalized->port_number = -1;
1827
7.03k
            tx->flags |= HTP_HOSTU_INVALID;
1828
7.03k
        } else if ((port_parsed > 0) && (port_parsed < 65536)) {
1829
            // Valid port number.
1830
4.79k
            normalized->port_number = (int) port_parsed;
1831
4.79k
        } else {
1832
            // Port number out of range.
1833
1.15k
            normalized->port_number = -1;
1834
1.15k
            tx->flags |= HTP_HOSTU_INVALID;
1835
1.15k
        }
1836
198k
    } else {
1837
198k
        normalized->port_number = -1;
1838
198k
    }
1839
1840
    // Path.
1841
211k
    if (incomplete->path != NULL) {
1842
        // Make a copy of the path, so that we can work on it.
1843
132k
        normalized->path = bstr_dup(incomplete->path);
1844
132k
        if (normalized->path == NULL) return HTP_ERROR;
1845
1846
        // Decode URL-encoded (and %u-encoded) characters, as well as lowercase,
1847
        // compress separators and convert backslashes.
1848
132k
        htp_decode_path_inplace(tx, normalized->path);
1849
1850
        // Handle UTF-8 in the path.
1851
132k
        if (tx->cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_convert_bestfit) {
1852
            // Decode Unicode characters into a single-byte stream, using best-fit mapping.
1853
0
            htp_utf8_decode_path_inplace(tx->cfg, tx, normalized->path);
1854
132k
        } else {
1855
            // No decoding, but try to validate the path as a UTF-8 stream.
1856
132k
            htp_utf8_validate_path(tx, normalized->path);
1857
132k
        }
1858
1859
        // RFC normalization.
1860
132k
        htp_normalize_uri_path_inplace(normalized->path);
1861
132k
    }
1862
1863
    // Query string.
1864
211k
    if (incomplete->query != NULL) {
1865
10.3k
        normalized->query = bstr_dup(incomplete->query);
1866
10.3k
        if (normalized->query == NULL) return HTP_ERROR;
1867
10.3k
    }
1868
1869
    // Fragment.
1870
211k
    if (incomplete->fragment != NULL) {
1871
8.32k
        normalized->fragment = bstr_dup(incomplete->fragment);
1872
8.32k
        if (normalized->fragment == NULL) return HTP_ERROR;
1873
8.32k
        htp_tx_urldecode_uri_inplace(tx, normalized->fragment);
1874
8.32k
    }
1875
1876
211k
    return HTP_OK;
1877
211k
}
1878
1879
/**
1880
 * Normalize request hostname. Convert all characters to lowercase and
1881
 * remove trailing dots from the end, if present.
1882
 *
1883
 * @param[in] hostname
1884
 * @return Normalized hostname.
1885
 */
1886
28.6k
bstr *htp_normalize_hostname_inplace(bstr *hostname) {
1887
28.6k
    if (hostname == NULL) return NULL;
1888
1889
28.6k
    bstr_to_lowercase(hostname);
1890
1891
    // Remove dots from the end of the string.    
1892
30.4k
    while (bstr_char_at_end(hostname, 0) == '.') bstr_chop(hostname);
1893
1894
28.6k
    return hostname;
1895
28.6k
}
1896
1897
/**
1898
 * Normalize URL path. This function implements the remove dot segments algorithm
1899
 * specified in RFC 3986, section 5.2.4.
1900
 *
1901
 * @param[in] s
1902
 */
1903
132k
void htp_normalize_uri_path_inplace(bstr *s) {
1904
132k
    if (s == NULL) return;
1905
1906
132k
    unsigned char *data = bstr_ptr(s);
1907
132k
    if (data == NULL) return;
1908
132k
    size_t len = bstr_len(s);
1909
1910
132k
    size_t rpos = 0;
1911
132k
    size_t wpos = 0;
1912
1913
132k
    int c = -1;
1914
428k
    while ((rpos < len)&&(wpos < len)) {
1915
295k
        if (c == -1) {
1916
292k
            c = data[rpos++];
1917
292k
        }
1918
1919
        // A. If the input buffer begins with a prefix of "../" or "./",
1920
        //    then remove that prefix from the input buffer; otherwise,
1921
295k
        if (c == '.') {
1922
4.47k
            if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
1923
331
                c = -1;
1924
331
                rpos += 2;
1925
331
                continue;
1926
4.14k
            } else if ((rpos < len) && (data[rpos] == '/')) {
1927
960
                c = -1;
1928
960
                rpos += 1;
1929
960
                continue;
1930
960
            }
1931
4.47k
        }
1932
1933
294k
        if (c == '/') {
1934
            // B. if the input buffer begins with a prefix of "/./" or "/.",
1935
            //    where "." is a complete path segment, then replace that
1936
            //    prefix with "/" in the input buffer; otherwise,
1937
231k
            if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
1938
999
                c = '/';
1939
999
                rpos += 2;
1940
999
                continue;
1941
230k
            } else if ((rpos + 1 == len) && (data[rpos] == '.')) {
1942
379
                c = '/';
1943
379
                rpos += 1;
1944
379
                continue;
1945
379
            }
1946
1947
            // C. if the input buffer begins with a prefix of "/../" or "/..",
1948
            //    where ".." is a complete path segment, then replace that
1949
            //    prefix with "/" in the input buffer and remove the last
1950
            //    segment and its preceding "/" (if any) from the output
1951
            //    buffer; otherwise,
1952
230k
            if ((rpos + 2 < len) && (data[rpos] == '.') && (data[rpos + 1] == '.') && (data[rpos + 2] == '/')) {
1953
2.04k
                c = '/';
1954
2.04k
                rpos += 3;
1955
1956
                // Remove the last segment
1957
184k
                while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
1958
2.04k
                if (wpos > 0) wpos--;
1959
2.04k
                continue;
1960
228k
            } else if ((rpos + 2 == len) && (data[rpos] == '.') && (data[rpos + 1] == '.')) {
1961
2.16k
                c = '/';
1962
2.16k
                rpos += 2;
1963
1964
                // Remove the last segment
1965
17.7k
                while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
1966
2.16k
                if (wpos > 0) wpos--;
1967
2.16k
                continue;
1968
2.16k
            }
1969
230k
        }
1970
1971
        // D.  if the input buffer consists only of "." or "..", then remove
1972
        // that from the input buffer; otherwise,
1973
289k
        if ((c == '.') && (rpos == len)) {
1974
1.55k
            rpos++;
1975
1.55k
            continue;
1976
1.55k
        }
1977
1978
287k
        if ((c == '.') && (rpos + 1 == len) && (data[rpos] == '.')) {
1979
171
            rpos += 2;
1980
171
            continue;
1981
171
        }
1982
1983
        // E.  move the first path segment in the input buffer to the end of
1984
        // the output buffer, including the initial "/" character (if
1985
        // any) and any subsequent characters up to, but not including,
1986
        // the next "/" character or the end of the input buffer.
1987
287k
        data[wpos++] = (uint8_t) c;
1988
1989
6.58M
        while ((rpos < len) && (data[rpos] != '/') && (wpos < len)) {
1990
6.30M
            data[wpos++] = data[rpos++];
1991
6.30M
        }
1992
1993
287k
        c = -1;
1994
287k
    }
1995
1996
132k
    bstr_adjust_len(s, wpos);
1997
132k
}
1998
1999
/**
2000
 *
2001
 */
2002
0
void fprint_bstr(FILE *stream, const char *name, bstr *b) {
2003
0
    if (b == NULL) {
2004
0
        fprint_raw_data_ex(stream, name, "(null)", 0, 6);
2005
0
        return;
2006
0
    }
2007
2008
0
    fprint_raw_data_ex(stream, name, bstr_ptr(b), 0, bstr_len(b));
2009
0
}
2010
2011
/**
 * Dump a memory region to a stream, starting at offset zero. When the data
 * pointer is NULL (may happen for gaps) only a one-line note carrying the
 * length is printed.
 *
 * @param[in] stream
 * @param[in] name
 * @param[in] data
 * @param[in] len
 */
void fprint_raw_data(FILE *stream, const char *name, const void *data, size_t len) {
    if (data != NULL) {
        fprint_raw_data_ex(stream, name, data, 0, len);
        return;
    }

    // may happen for gaps
    fprintf(stream, "\n%s: ptr NULL len %u\n", name, (unsigned int)len);
}
2022
2023
/**
 * Write a hex/ASCII dump of a memory region to a stream.
 *
 * Output starts with a header line giving the name, the data pointer, the
 * starting offset and the end offset (offset + printlen). It is followed by
 * one row per 16 bytes: the row offset in hexadecimal, two groups of eight
 * two-digit hex values (missing bytes are padded with spaces), and the same
 * bytes between '|' characters with non-printable bytes shown as '.'. A
 * blank line terminates the dump.
 *
 * @param[in] stream
 * @param[in] name
 * @param[in] _data
 * @param[in] offset Index of the first byte of _data to print.
 * @param[in] printlen Number of bytes to print.
 */
void fprint_raw_data_ex(FILE *stream, const char *name, const void *_data, size_t offset, size_t printlen) {
    const unsigned char *data = (const unsigned char *) _data;
    const size_t len = offset + printlen;

    // Print the sizes with a 64-bit format so they are not truncated.
    fprintf(stream, "\n%s: ptr %p offset %" PRIu64 " len %" PRIu64 "\n",
            name, (void *) data, (uint64_t) offset, (uint64_t) len);

    while (offset < len) {
        size_t i;

        // Row offset. The format is "%" PRIx64: the previous "%x" PRIx64
        // printed a stray "lx" after the number.
        fprintf(stream, "%" PRIx64 "  ", (uint64_t) offset);

        // Sixteen hex cells, with one extra space between the two groups of eight.
        for (i = 0; i < 16; i++) {
            if (i == 8) fputc(' ', stream);

            if (offset + i < len) {
                fprintf(stream, "%02x ", data[offset + i]);
            } else {
                fputs("   ", stream);
            }
        }

        // Printable rendering of the same bytes.
        fputs(" |", stream);

        for (i = 0; (i < 16) && (offset + i < len); i++) {
            const unsigned char c = data[offset + i];
            fputc(isprint(c) ? c : '.', stream);
        }

        fputs("|\n", stream);

        offset += 16;
    }

    fprintf(stream, "\n");
}
2093
2094
/**
2095
 *
2096
 */
2097
0
char *htp_connp_in_state_as_string(htp_connp_t *connp) {
2098
0
    if (connp == NULL) return "NULL";
2099
2100
0
    if (connp->in_state == htp_connp_REQ_IDLE) return "REQ_IDLE";
2101
0
    if (connp->in_state == htp_connp_REQ_LINE) return "REQ_LINE";
2102
0
    if (connp->in_state == htp_connp_REQ_PROTOCOL) return "REQ_PROTOCOL";
2103
0
    if (connp->in_state == htp_connp_REQ_HEADERS) return "REQ_HEADERS";
2104
0
    if (connp->in_state == htp_connp_REQ_CONNECT_CHECK) return "REQ_CONNECT_CHECK";
2105
0
    if (connp->in_state == htp_connp_REQ_CONNECT_WAIT_RESPONSE) return "REQ_CONNECT_WAIT_RESPONSE";
2106
0
    if (connp->in_state == htp_connp_REQ_BODY_DETERMINE) return "REQ_BODY_DETERMINE";
2107
0
    if (connp->in_state == htp_connp_REQ_BODY_IDENTITY) return "REQ_BODY_IDENTITY";
2108
0
    if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_LENGTH) return "REQ_BODY_CHUNKED_LENGTH";
2109
0
    if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA) return "REQ_BODY_CHUNKED_DATA";
2110
0
    if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA_END) return "REQ_BODY_CHUNKED_DATA_END";
2111
0
    if (connp->in_state == htp_connp_REQ_FINALIZE) return "REQ_FINALIZE";
2112
0
    if (connp->in_state == htp_connp_REQ_IGNORE_DATA_AFTER_HTTP_0_9) return "REQ_IGNORE_DATA_AFTER_HTTP_0_9";
2113
2114
0
    return "UNKNOWN";
2115
0
}
2116
2117
/**
2118
 *
2119
 */
2120
0
char *htp_connp_out_state_as_string(htp_connp_t *connp) {
2121
0
    if (connp == NULL) return "NULL";
2122
2123
0
    if (connp->out_state == htp_connp_RES_IDLE) return "RES_IDLE";
2124
0
    if (connp->out_state == htp_connp_RES_LINE) return "RES_LINE";
2125
0
    if (connp->out_state == htp_connp_RES_HEADERS) return "RES_HEADERS";
2126
0
    if (connp->out_state == htp_connp_RES_BODY_DETERMINE) return "RES_BODY_DETERMINE";
2127
0
    if (connp->out_state == htp_connp_RES_BODY_IDENTITY_CL_KNOWN) return "RES_BODY_IDENTITY_CL_KNOWN";
2128
0
    if (connp->out_state == htp_connp_RES_BODY_IDENTITY_STREAM_CLOSE) return "RES_BODY_IDENTITY_STREAM_CLOSE";
2129
0
    if (connp->out_state == htp_connp_RES_BODY_CHUNKED_LENGTH) return "RES_BODY_CHUNKED_LENGTH";
2130
0
    if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA) return "RES_BODY_CHUNKED_DATA";
2131
0
    if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA_END) return "RES_BODY_CHUNKED_DATA_END";
2132
0
    if (connp->out_state == htp_connp_RES_FINALIZE) return "RES_BODY_FINALIZE";
2133
2134
0
    return "UNKNOWN";
2135
0
}
2136
2137
/**
2138
 *
2139
 */
2140
0
char *htp_tx_request_progress_as_string(htp_tx_t *tx) {
2141
0
    if (tx == NULL) return "NULL";
2142
2143
0
    switch (tx->request_progress) {
2144
0
        case HTP_REQUEST_NOT_STARTED:
2145
0
            return "NOT_STARTED";
2146
0
        case HTP_REQUEST_LINE:
2147
0
            return "REQ_LINE";
2148
0
        case HTP_REQUEST_HEADERS:
2149
0
            return "REQ_HEADERS";
2150
0
        case HTP_REQUEST_BODY:
2151
0
            return "REQ_BODY";
2152
0
        case HTP_REQUEST_TRAILER:
2153
0
            return "REQ_TRAILER";
2154
0
        case HTP_REQUEST_COMPLETE:
2155
0
            return "COMPLETE";
2156
0
    }
2157
2158
0
    return "INVALID";
2159
0
}
2160
2161
/**
2162
 *
2163
 */
2164
0
char *htp_tx_response_progress_as_string(htp_tx_t *tx) {
2165
0
    if (tx == NULL) return "NULL";
2166
2167
0
    switch (tx->response_progress) {
2168
0
        case HTP_RESPONSE_NOT_STARTED:
2169
0
            return "NOT_STARTED";
2170
0
        case HTP_RESPONSE_LINE:
2171
0
            return "RES_LINE";
2172
0
        case HTP_RESPONSE_HEADERS:
2173
0
            return "RES_HEADERS";
2174
0
        case HTP_RESPONSE_BODY:
2175
0
            return "RES_BODY";
2176
0
        case HTP_RESPONSE_TRAILER:
2177
0
            return "RES_TRAILER";
2178
0
        case HTP_RESPONSE_COMPLETE:
2179
0
            return "COMPLETE";
2180
0
    }
2181
2182
0
    return "INVALID";
2183
0
}
2184
2185
0
/**
 * Assemble a URI string from its parsed components, copying every component
 * as-is (no encoding is applied). The layout is
 * scheme "://" username ":" password "@" hostname ":" port path "?" query "#" fragment,
 * where each optional part is emitted only if the component is present; the
 * ":" and "@" of the user information are both emitted when either the
 * username or the password is present.
 *
 * @param[in] uri
 * @return A newly allocated bstr, or NULL if uri is NULL or allocation fails.
 */
bstr *htp_unparse_uri_noencode(htp_uri_t *uri) {
    if (uri == NULL) return NULL;

    const int has_userinfo = (uri->username != NULL) || (uri->password != NULL);

    // First pass: work out how long the final string is going to be.
    size_t len = 0;

    if (uri->scheme != NULL) len += bstr_len(uri->scheme) + 3; // "://"

    if (has_userinfo) {
        if (uri->username != NULL) len += bstr_len(uri->username);
        if (uri->password != NULL) len += bstr_len(uri->password);
        len += 2; // ":" and "@"
    }

    if (uri->hostname != NULL) len += bstr_len(uri->hostname);
    if (uri->port != NULL) len += 1 + bstr_len(uri->port); // ":"
    if (uri->path != NULL) len += bstr_len(uri->path);
    if (uri->query != NULL) len += 1 + bstr_len(uri->query); // "?"
    if (uri->fragment != NULL) len += 1 + bstr_len(uri->fragment); // "#"

    // Second pass: allocate once and append the pieces in order.
    bstr *result = bstr_alloc(len);
    if (result == NULL) return NULL;

    if (uri->scheme != NULL) {
        bstr_add_noex(result, uri->scheme);
        bstr_add_c_noex(result, "://");
    }

    if (has_userinfo) {
        if (uri->username != NULL) bstr_add_noex(result, uri->username);
        bstr_add_c_noex(result, ":");
        if (uri->password != NULL) bstr_add_noex(result, uri->password);
        bstr_add_c_noex(result, "@");
    }

    if (uri->hostname != NULL) bstr_add_noex(result, uri->hostname);

    if (uri->port != NULL) {
        bstr_add_c_noex(result, ":");
        bstr_add_noex(result, uri->port);
    }

    if (uri->path != NULL) bstr_add_noex(result, uri->path);

    if (uri->query != NULL) {
        bstr_add_c_noex(result, "?");
        bstr_add_noex(result, uri->query);
    }

    if (uri->fragment != NULL) {
        bstr_add_c_noex(result, "#");
        bstr_add_noex(result, uri->fragment);
    }

    return result;
}
2281
2282
/**
 * Decide whether a supposed response line should be treated as response body
 * instead. Browsers are lax when it comes to response line parsing; in most
 * cases they only look for the word "http" at the beginning. Accordingly,
 * leading whitespace and NUL bytes are skipped and the next four bytes are
 * compared, case-insensitively, against "HTTP".
 *
 * @param[in] data pointer to bytearray
 * @param[in] len length in bytes of data
 * @return 0 if the line starts with "HTTP" (it is good enough to be a
 *         response line); 1 if it does not, if fewer than four bytes remain
 *         after the skipped prefix, or if data is NULL (treat it as body).
 */
int htp_treat_response_line_as_body(const uint8_t *data, size_t len) {
    // Browser behavior:
    //      Firefox 3.5.x: (?i)^\s*http
    //      IE: (?i)^\s*http\s*/
    //      Safari: ^HTTP/\d+\.\d+\s+\d{3}
    static const char upper[] = "HTTP";
    static const char lower[] = "http";

    if (data == NULL) return 1;

    // Skip leading whitespace and NUL bytes.
    size_t start = 0;
    while ((start < len) && (htp_is_space(data[start]) || data[start] == 0)) {
        start++;
    }

    // Four more bytes are needed for the comparison.
    if (len < start + 4) return 1;

    for (size_t i = 0; i < 4; i++) {
        const uint8_t c = data[start + i];
        if ((c != upper[i]) && (c != lower[i])) return 1;
    }

    return 0;
}
2311
2312
/**
2313
 * Run the REQUEST_BODY_DATA hook.
2314
 *
2315
 * @param[in] connp
2316
 * @param[in] d
2317
 */
2318
754k
htp_status_t htp_req_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2319
    // Do not invoke callbacks with an empty data chunk
2320
754k
    if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2321
2322
    // Do not invoke callbacks without a transaction.
2323
752k
    if (connp->in_tx == NULL) return HTP_OK;
2324
2325
    // Run transaction hooks first
2326
752k
    htp_status_t rc = htp_hook_run_all(connp->in_tx->hook_request_body_data, d);
2327
752k
    if (rc != HTP_OK) return rc;
2328
2329
    // Run configuration hooks second
2330
752k
    rc = htp_hook_run_all(connp->cfg->hook_request_body_data, d);
2331
752k
    if (rc != HTP_OK) return rc;
2332
2333
    // On PUT requests, treat request body as file
2334
752k
    if (connp->put_file != NULL) {
2335
317
        htp_file_data_t file_data;
2336
2337
317
        file_data.data = d->data;
2338
317
        file_data.len = d->len;
2339
317
        file_data.file = connp->put_file;
2340
317
        file_data.file->len += d->len;
2341
2342
317
        rc = htp_hook_run_all(connp->cfg->hook_request_file_data, &file_data);
2343
317
        if (rc != HTP_OK) return rc;
2344
317
    }
2345
2346
752k
    return HTP_OK;
2347
752k
}
2348
2349
/**
2350
 * Run the RESPONSE_BODY_DATA hook.
2351
 *
2352
 * @param[in] connp
2353
 * @param[in] d
2354
 */
2355
984k
htp_status_t htp_res_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2356
    // Do not invoke callbacks with an empty data chunk.
2357
984k
    if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2358
2359
    // Run transaction hooks first
2360
980k
    htp_status_t rc = htp_hook_run_all(connp->out_tx->hook_response_body_data, d);
2361
980k
    if (rc != HTP_OK) return rc;
2362
2363
    // Run configuration hooks second
2364
980k
    rc = htp_hook_run_all(connp->cfg->hook_response_body_data, d);
2365
980k
    if (rc != HTP_OK) return rc;
2366
2367
980k
    return HTP_OK;
2368
980k
}
2369
2370
/**
2371
 * Parses the provided memory region, extracting the double-quoted string.
2372
 *
2373
 * @param[in] data
2374
 * @param[in] len
2375
 * @param[out] out
2376
 * @param[out] endoffset
2377
 * @return HTP_OK on success, HTP_DECLINED if the input is not well formed, and HTP_ERROR on fatal errors.
2378
 */
2379
259
htp_status_t htp_extract_quoted_string_as_bstr(unsigned char *data, size_t len, bstr **out, size_t *endoffset) {
2380
259
    if ((data == NULL) || (out == NULL)) return HTP_ERROR;
2381
2382
259
    if (len == 0) return HTP_DECLINED;
2383
2384
259
    size_t pos = 0;
2385
2386
    // Check that the first character is a double quote.
2387
259
    if (data[pos] != '"') return HTP_DECLINED;
2388
2389
    // Step over the double quote.
2390
259
    pos++;
2391
259
    if (pos == len) return HTP_DECLINED;
2392
2393
    // Calculate the length of the resulting string.
2394
259
    size_t escaped_chars = 0;
2395
92.2k
    while (pos < len) {
2396
92.2k
        if (data[pos] == '\\') {
2397
104
            if (pos + 1 < len) {
2398
104
                escaped_chars++;
2399
104
                pos += 2;
2400
104
                continue;
2401
104
            }
2402
92.1k
        } else if (data[pos] == '"') {
2403
210
            break;
2404
210
        }
2405
2406
91.9k
        pos++;
2407
91.9k
    }
2408
2409
    // Have we reached the end of input without seeing the terminating double quote?
2410
259
    if (pos == len) return HTP_DECLINED;
2411
2412
    // Copy the data and unescape it as necessary.
2413
210
    size_t outlen = pos - 1 - escaped_chars;
2414
210
    *out = bstr_alloc(outlen);
2415
210
    if (*out == NULL) return HTP_ERROR;
2416
210
    unsigned char *outptr = bstr_ptr(*out);
2417
210
    size_t outpos = 0;
2418
2419
210
    pos = 1;
2420
74.4k
    while ((pos < len) && (outpos < outlen)) {
2421
        // TODO We are not properly unescaping test here, we're only
2422
        //      handling escaped double quotes.
2423
74.2k
        if (data[pos] == '\\') {
2424
83
            if (pos + 1 < len) {
2425
83
                outptr[outpos++] = data[pos + 1];
2426
83
                pos += 2;
2427
83
                continue;
2428
83
            }
2429
74.1k
        } else if (data[pos] == '"') {
2430
0
            break;
2431
0
        }
2432
2433
74.1k
        outptr[outpos++] = data[pos++];
2434
74.1k
    }
2435
2436
210
    bstr_adjust_len(*out, outlen);
2437
2438
210
    if (endoffset != NULL) {
2439
0
        *endoffset = pos;
2440
0
    }
2441
2442
210
    return HTP_OK;
2443
210
}
2444
2445
11.2k
/**
 * Extract the media type from a Content-Type header value: everything up to
 * (not including) the first ';', ',' or space, converted to lowercase.
 *
 * @param[in] header
 * @param[out] ct Receives a newly allocated bstr holding the media type.
 * @return HTP_OK on success, HTP_ERROR if an argument is NULL or the copy
 *         cannot be allocated.
 */
htp_status_t htp_parse_ct_header(bstr *header, bstr **ct) {
    if ((header == NULL) || (ct == NULL)) return HTP_ERROR;

    const unsigned char *data = bstr_ptr(header);
    const size_t len = bstr_len(header);

    // The assumption here is that the header value we receive
    // here has been left-trimmed, which means the starting position
    // is on the media type. On some platforms that may not be the
    // case, and we may need to do the left-trim ourselves.

    // Find the end of the MIME type, using the same approach PHP 5.4.3 uses.
    size_t end;
    for (end = 0; end < len; end++) {
        const unsigned char ch = data[end];
        if ((ch == ';') || (ch == ',') || (ch == ' ')) break;
    }

    *ct = bstr_dup_ex(header, 0, end);
    if (*ct == NULL) return HTP_ERROR;

    bstr_to_lowercase(*ct);

    return HTP_OK;
}
2467
2468
/**
2469
 * Implements relaxed (not strictly RFC) hostname validation.
2470
 * 
2471
 * @param[in] hostname
2472
 * @return 1 if the supplied hostname is valid; 0 if it is not.
2473
 */
2474
62.2k
int htp_validate_hostname(bstr *hostname) {
2475
62.2k
    unsigned char *data = bstr_ptr(hostname);
2476
62.2k
    size_t len = bstr_len(hostname);
2477
62.2k
    size_t startpos = 0;
2478
62.2k
    size_t pos = 0;
2479
2480
62.2k
    if ((len == 0) || (len > 255)) return 0;
2481
2482
56.4k
    if (data[0] == '[') {
2483
        // only ipv6 possible
2484
3.74k
        if (len < 2 || len - 2 >= INET6_ADDRSTRLEN) {
2485
847
            return 0;
2486
847
        }
2487
2.89k
        char dst[sizeof(struct in6_addr)];
2488
2.89k
        char str[INET6_ADDRSTRLEN];
2489
2.89k
        memcpy(str, data+1, len-2);
2490
2.89k
        str[len-2] = 0;
2491
2.89k
        return inet_pton(AF_INET6, str, dst);
2492
3.74k
    }
2493
112k
    while (pos < len) {
2494
        // Validate label characters.
2495
111k
        startpos = pos;
2496
659k
        while ((pos < len) && (data[pos] != '.')) {
2497
571k
            unsigned char c = data[pos];
2498
            // According to the RFC, the underscore is not allowed in a label, but
2499
            // we allow it here because we think it's often seen in practice.
2500
571k
            if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
2501
83.2k
                        ((c >= '0') && (c <= '9')) ||
2502
32.2k
                         (c == '-') || (c == '_')))
2503
23.7k
            {
2504
23.7k
                return 0;
2505
23.7k
            }
2506
2507
547k
            pos++;
2508
547k
        }
2509
2510
        // Validate label length.
2511
87.8k
        if ((pos - startpos == 0) || (pos - startpos > 63)) return 0;
2512
2513
87.2k
        if (pos >= len) return 1; // No more data after label.
2514
2515
        // How many dots are there?
2516
60.5k
        startpos = pos;
2517
121k
        while ((pos < len) && (data[pos] == '.')) pos++;
2518
2519
60.5k
        if (pos - startpos != 1) return 0; // Exactly one dot expected.
2520
60.5k
    }
2521
2522
1.39k
    return 1;
2523
52.6k
}
2524
2525
544k
/**
 * Releases a URI structure: passes each of its component strings to
 * bstr_free() and then frees the structure itself.
 *
 * @param[in] uri URI structure to release; NULL is accepted and ignored.
 */
void htp_uri_free(htp_uri_t *uri) {
    if (uri != NULL) {
        // Components first, then the container that holds them.
        bstr_free(uri->scheme);
        bstr_free(uri->username);
        bstr_free(uri->password);
        bstr_free(uri->hostname);
        bstr_free(uri->port);
        bstr_free(uri->path);
        bstr_free(uri->query);
        bstr_free(uri->fragment);

        free(uri);
    }
}
2539
2540
538k
htp_uri_t *htp_uri_alloc(void) {
2541
538k
    htp_uri_t *u = calloc(1, sizeof (htp_uri_t));
2542
538k
    if (u == NULL) return NULL;
2543
2544
538k
    u->port_number = -1;
2545
2546
538k
    return u;
2547
538k
}
2548
2549
0
char *htp_get_version(void) {
2550
0
    return HTP_VERSION_STRING_FULL;
2551
0
}
2552
2553
/**
2554
 * Tells if a header value (haystack) contains a token (needle)
2555
 * This is done with a caseless comparison
2556
 *
2557
 * @param[in] hvp header value pointer
2558
 * @param[in] hvlen length of header value buffer
2559
 * @param[in] value token to look for (null-terminated string), should be a lowercase constant
2560
 * @return HTP_OK if the header has the token; HTP_ERROR if it has not.
2561
 */
2562
1.97k
htp_status_t htp_header_has_token(const unsigned char *hvp, size_t hvlen, const unsigned char *value) {
2563
1.97k
    int state = 0;
2564
    // offset to compare in value
2565
1.97k
    size_t v_off = 0;
2566
    // The header value is a list of comma-separated tokens (with additional spaces)
2567
55.3k
    for (size_t i = 0; i < hvlen; i++) {
2568
53.9k
        switch (state) {
2569
20.4k
            case 0:
2570
20.4k
                if (v_off == 0 && htp_is_space(hvp[i])) {
2571
                    // skip leading space
2572
2.15k
                    continue;
2573
2.15k
                }
2574
18.3k
                if (tolower(hvp[i]) == value[v_off]) {
2575
16.5k
                    v_off++;
2576
16.5k
                    if (value[v_off] == 0) {
2577
                        // finish validation if end of token
2578
2.29k
                        state = 2;
2579
2.29k
                    }
2580
16.5k
                    continue;
2581
16.5k
                } else {
2582
                    // wait for a new token
2583
1.73k
                    v_off = 0;
2584
1.73k
                    state = 1;
2585
1.73k
                }
2586
                // fallthrough
2587
30.4k
            case 1:
2588
30.4k
                if (hvp[i] == ',') {
2589
                    // start of next token
2590
2.06k
                    state = 0;
2591
2.06k
                }
2592
30.4k
                break;
2593
4.73k
            case 2:
2594
4.73k
                if (hvp[i] == ',') {
2595
558
                    return HTP_OK;
2596
558
                }
2597
4.17k
                if (!htp_is_space(hvp[i])) {
2598
                    // trailing junk in token, wait for a next one
2599
469
                    v_off = 0;
2600
469
                    state = 1;
2601
469
                }
2602
53.9k
        }
2603
53.9k
    }
2604
1.41k
    if (state == 2) {
2605
1.27k
        return HTP_OK;
2606
1.27k
    }
2607
142
    return HTP_ERROR;
2608
1.41k
}