Coverage Report

Created: 2025-07-12 06:24

/src/libhtp/htp/htp_util.c
Line
Count
Source (jump to first uncovered line)
1
/***************************************************************************
2
 * Copyright (c) 2009-2010 Open Information Security Foundation
3
 * Copyright (c) 2010-2013 Qualys, Inc.
4
 * All rights reserved.
5
 * 
6
 * Redistribution and use in source and binary forms, with or without
7
 * modification, are permitted provided that the following conditions are
8
 * met:
9
 * 
10
 * - Redistributions of source code must retain the above copyright
11
 *   notice, this list of conditions and the following disclaimer.
12
13
 * - Redistributions in binary form must reproduce the above copyright
14
 *   notice, this list of conditions and the following disclaimer in the
15
 *   documentation and/or other materials provided with the distribution.
16
17
 * - Neither the name of the Qualys, Inc. nor the names of its
18
 *   contributors may be used to endorse or promote products derived from
19
 *   this software without specific prior written permission.
20
 * 
21
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
 ***************************************************************************/
33
34
/**
35
 * @file
36
 * @author Ivan Ristic <ivanr@webkreator.com>
37
 */
38
39
#include "htp_config_auto.h"
40
41
//inet_pton
42
#if _WIN32
43
#include <ws2tcpip.h>
44
#else // mac, linux, freebsd
45
#include <sys/types.h>
46
#include <sys/socket.h>
47
#include <netinet/in.h>
48
#include <arpa/inet.h>
49
#endif
50
51
#include "htp_private.h"
52
53
/**
 * Tests whether a character is linear white space (LWS), that is,
 * a space or a horizontal tab.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a space or a tab, 0 otherwise.
 */
int htp_is_lws(int c) {
    switch (c) {
        case ' ':
        case '\t':
            return 1;
        default:
            return 0;
    }
}
63
64
/**
 * Tests whether a character is one of the HTTP separator characters:
 *
 *   ( ) < > @ , ; : \ " / [ ] ? = { } SP HT
 *
 * @param[in] c Character to test.
 * @return 1 if c is a separator, 0 otherwise.
 */
int htp_is_separator(int c) {
    static const char separators[] = "()<>@,;:\\\"/[]?={} \t";

    // Every separator is a non-zero 7-bit value. Rejecting anything else up
    // front keeps the byte-wise lookup below from matching a truncated value.
    if ((c <= 0) || (c > 127)) return 0;

    // The terminating NUL of the literal is excluded from the search.
    return (memchr(separators, c, sizeof (separators) - 1) != NULL) ? 1 : 0;
}
101
102
/**
 * Tests whether a character counts as a text character: a horizontal tab,
 * or any value that is not below 32.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a tab or is greater than or equal to 32, 0 otherwise.
 */
int htp_is_text(int c) {
    return ((c == '\t') || (c >= 32)) ? 1 : 0;
}
113
114
/**
 * Tests whether a character may appear in a token.
 *
 *   token = 1*<any CHAR except CTLs or separators>
 *   CHAR  = <any US-ASCII character (octets 0 - 127)>
 *
 * @param[in] c Character to test.
 * @return 1 if c is in the range 32..126 and is not a separator, 0 otherwise.
 */
int htp_is_token(int c) {
    int in_range = (c >= 32) && (c <= 126);

    // The separator check only runs for characters inside the range.
    return (in_range && !htp_is_separator(c)) ? 1 : 0;
}
127
128
/**
129
 * Remove all line terminators (LF, CR or CRLF) from
130
 * the end of the line provided as input.
131
 *
132
 * @return 0 if nothing was removed, 1 if one or more LF characters were removed, or
133
 *         2 if one or more CR and/or LF characters were removed.
134
 */
135
1.60M
int htp_chomp(unsigned char *data, size_t *len) {
136
1.60M
    int r = 0;
137
138
    // Loop until there's no more stuff in the buffer
139
2.84M
    while (*len > 0) {
140
        // Try one LF first
141
2.75M
        if (data[*len - 1] == LF) {
142
928k
            (*len)--;
143
928k
            r = 1;
144
145
928k
            if (*len == 0) return r;
146
147
            // A CR is allowed before LF
148
911k
            if (data[*len - 1] == CR) {
149
74.6k
                (*len)--;
150
74.6k
                r = 2;
151
74.6k
            }
152
1.82M
        } else if (data[*len - 1] == CR) {
153
332k
            (*len)--;
154
332k
            r = 1;
155
1.49M
        } else return r;
156
2.75M
    }
157
158
93.9k
    return r;
159
1.60M
}
160
161
/**
 * Tests whether a character is white space: space, form feed, vertical tab,
 * horizontal tab, carriage return or line feed.
 *
 * @param[in] c Character to test.
 * @return 1 if c is one of the six white space characters, 0 otherwise.
 */
int htp_is_space(int c) {
    return (c == ' ') || (c == '\f') || (c == '\v') ||
           (c == '\t') || (c == '\r') || (c == '\n');
}
180
181
/**
182
 * Converts request method, given as a string, into a number.
183
 *
184
 * @param[in] method
185
 * @return Method number of M_UNKNOWN
186
 */
187
171k
int htp_convert_method_to_number(bstr *method) {
188
171k
    if (method == NULL) return HTP_M_UNKNOWN;
189
190
    // TODO Optimize using parallel matching, or something similar.
191
192
171k
    if (bstr_cmp_c(method, "GET") == 0) return HTP_M_GET;
193
170k
    if (bstr_cmp_c(method, "PUT") == 0) return HTP_M_PUT;
194
168k
    if (bstr_cmp_c(method, "POST") == 0) return HTP_M_POST;
195
168k
    if (bstr_cmp_c(method, "DELETE") == 0) return HTP_M_DELETE;
196
168k
    if (bstr_cmp_c(method, "CONNECT") == 0) return HTP_M_CONNECT;
197
162k
    if (bstr_cmp_c(method, "OPTIONS") == 0) return HTP_M_OPTIONS;
198
162k
    if (bstr_cmp_c(method, "TRACE") == 0) return HTP_M_TRACE;
199
162k
    if (bstr_cmp_c(method, "PATCH") == 0) return HTP_M_PATCH;
200
162k
    if (bstr_cmp_c(method, "PROPFIND") == 0) return HTP_M_PROPFIND;
201
162k
    if (bstr_cmp_c(method, "PROPPATCH") == 0) return HTP_M_PROPPATCH;
202
162k
    if (bstr_cmp_c(method, "MKCOL") == 0) return HTP_M_MKCOL;
203
162k
    if (bstr_cmp_c(method, "COPY") == 0) return HTP_M_COPY;
204
162k
    if (bstr_cmp_c(method, "MOVE") == 0) return HTP_M_MOVE;
205
162k
    if (bstr_cmp_c(method, "LOCK") == 0) return HTP_M_LOCK;
206
161k
    if (bstr_cmp_c(method, "UNLOCK") == 0) return HTP_M_UNLOCK;
207
160k
    if (bstr_cmp_c(method, "VERSION-CONTROL") == 0) return HTP_M_VERSION_CONTROL;
208
160k
    if (bstr_cmp_c(method, "CHECKOUT") == 0) return HTP_M_CHECKOUT;
209
160k
    if (bstr_cmp_c(method, "UNCHECKOUT") == 0) return HTP_M_UNCHECKOUT;
210
160k
    if (bstr_cmp_c(method, "CHECKIN") == 0) return HTP_M_CHECKIN;
211
160k
    if (bstr_cmp_c(method, "UPDATE") == 0) return HTP_M_UPDATE;
212
160k
    if (bstr_cmp_c(method, "LABEL") == 0) return HTP_M_LABEL;
213
160k
    if (bstr_cmp_c(method, "REPORT") == 0) return HTP_M_REPORT;
214
160k
    if (bstr_cmp_c(method, "MKWORKSPACE") == 0) return HTP_M_MKWORKSPACE;
215
160k
    if (bstr_cmp_c(method, "MKACTIVITY") == 0) return HTP_M_MKACTIVITY;
216
160k
    if (bstr_cmp_c(method, "BASELINE-CONTROL") == 0) return HTP_M_BASELINE_CONTROL;
217
160k
    if (bstr_cmp_c(method, "MERGE") == 0) return HTP_M_MERGE;
218
160k
    if (bstr_cmp_c(method, "INVALID") == 0) return HTP_M_INVALID;
219
160k
    if (bstr_cmp_c(method, "HEAD") == 0) return HTP_M_HEAD;
220
221
159k
    return HTP_M_UNKNOWN;
222
160k
}
223
224
/**
225
 * Is the given line empty?
226
 *
227
 * @param[in] data
228
 * @param[in] len
229
 * @return 0 or 1
230
 */
231
1.25M
int htp_is_line_empty(unsigned char *data, size_t len) {
232
1.25M
    if (((len == 1) && ((data[0] == CR) || (data[0] == LF))) ||
233
1.25M
        ((len == 2) && (data[0] == CR) && (data[1] == LF))) {
234
258k
        return 1;
235
258k
    }
236
237
996k
    return 0;
238
1.25M
}
239
240
/**
 * Tests whether every byte of the line is a white space character,
 * as determined by isspace().
 *
 * @param[in] data Line data.
 * @param[in] len Line length in bytes.
 * @return 1 if all len bytes are white space (also when len is 0),
 *         0 as soon as a non-white-space byte is found.
 */
int htp_is_line_whitespace(unsigned char *data, size_t len) {
    const unsigned char *cursor = data;
    size_t remaining = len;

    while (remaining > 0) {
        if (!isspace(*cursor)) return 0;
        cursor++;
        remaining--;
    }

    return 1;
}
258
259
/**
260
 * Parses Content-Length string (positive decimal number).
261
 * White space is allowed before and after the number.
262
 *
263
 * @param[in] b
264
 * @return Content-Length as a number, or -1 on error.
265
 */
266
0
int64_t htp_parse_content_length(bstr *b, htp_connp_t *connp) {
267
0
    size_t len = bstr_len(b);
268
0
    unsigned char * data = (unsigned char *) bstr_ptr(b);
269
0
    size_t pos = 0;
270
0
    int64_t r = 0;
271
272
0
    if (len == 0) return -1003;
273
274
    // Ignore junk before
275
0
    while ((pos < len) && (data[pos] < '0' || data[pos] > '9')) {
276
0
        if (!htp_is_lws(data[pos]) && connp != NULL && r == 0) {
277
0
            htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
278
0
                    "C-L value with extra data in the beginning");
279
0
            r = -1;
280
0
        }
281
0
        pos++;
282
0
    }
283
0
    if (pos == len) return -1001;
284
285
0
    r = bstr_util_mem_to_pint(data + pos, len - pos, 10, &pos);
286
    // Ok to have junk afterwards
287
0
    if (pos < len && connp != NULL) {
288
0
        htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
289
0
                "C-L value with extra data in the end");
290
0
    }
291
0
    return r;
292
0
}
293
294
/**
 * Parses a chunk length (positive hexadecimal number).
 *
 * Leading CR, LF, space, horizontal tab, vertical tab and form feed bytes are
 * skipped. Parsing then covers the run of hexadecimal digits that follows;
 * everything after that run is cut off before the number is converted.
 *
 * @param[in] data Chunk length line.
 * @param[in] len Length of the line in bytes.
 * @param[out] extension May be NULL. If not NULL, set to 1 when a ';' is found
 *                       in the bytes that follow the hexadecimal digits. It is
 *                       never written otherwise, so the caller must initialize it.
 * @return Chunk length, or a negative number on error: -1004 if the input is
 *         empty or consists only of the skipped leading characters, -1 if the
 *         value is greater than INT32_MAX, or the negative value returned by
 *         htp_parse_positive_integer_whitespace().
 */
int64_t htp_parse_chunked_length(unsigned char *data, size_t len, int *extension) {
    // Skip leading line feeds and other white space characters.
    while (len > 0) {
        unsigned char c = *data;
        int skippable = (c == '\r') || (c == '\n') || (c == ' ') ||
                        (c == '\t') || (c == '\v') || (c == '\f');
        if (!skippable) break;
        data++;
        len--;
    }
    if (len == 0) return -1004;

    // Count how many leading bytes are hexadecimal digits.
    size_t digits = 0;
    while ((digits < len) && isxdigit(data[digits])) digits++;

    // Cut off trailing junk, noting whether it contains a chunk extension.
    if (digits != len) {
        if ((extension != NULL) && (memchr(data + digits, ';', len - digits) != NULL)) {
            *extension = 1;
        }
        len = digits;
    }

    int64_t chunk_len = htp_parse_positive_integer_whitespace(data, len, 16);
    if (chunk_len < 0) return chunk_len;

    return (chunk_len > INT32_MAX) ? -1 : chunk_len;
}
344
345
/**
 * A somewhat forgiving parser for a positive integer in a given base.
 * Only LWS (space or tab) is allowed before and after the number.
 *
 * @param[in] data Input buffer.
 * @param[in] len Length of the input in bytes.
 * @param[in] base Numeric base passed on to bstr_util_mem_to_pint().
 * @return The parsed number on success; a negative number on error:
 *         -1003 if the input is empty, -1001 if it contains only LWS,
 *         -1002 if something other than LWS follows the number, or the
 *         negative value returned by bstr_util_mem_to_pint().
 */
int64_t htp_parse_positive_integer_whitespace(unsigned char *data, size_t len, int base) {
    if (len == 0) return -1003;

    // Ignore LWS before
    size_t offset = 0;
    for (; offset < len; offset++) {
        if (!htp_is_lws(data[offset])) break;
    }
    if (offset == len) return -1001;

    size_t consumed;
    int64_t value = bstr_util_mem_to_pint(data + offset, len - offset, base, &consumed);
    if (value < 0) return value;

    // Move after the last digit, then make sure only LWS follows.
    for (offset += consumed; offset < len; offset++) {
        if (!htp_is_lws(data[offset])) return -1002;
    }

    return value;
}
381
382
#ifdef HTP_DEBUG
383
384
/**
385
 * Prints one log message to stderr.
386
 *
387
 * @param[in] stream
388
 * @param[in] log
389
 */
390
void htp_print_log(FILE *stream, htp_log_t *log) {
391
    if (log->code != 0) {
392
        fprintf(stream, "[%d][code %d][file %s][line %d] %s\n", log->level,
393
                log->code, log->file, log->line, log->msg);
394
    } else {
395
        fprintf(stream, "[%d][file %s][line %d] %s\n", log->level,
396
                log->file, log->line, log->msg);
397
    }
398
}
399
#endif
400
401
/**
402
 * Records one log message.
403
 * 
404
 * @param[in] connp
405
 * @param[in] file
406
 * @param[in] line
407
 * @param[in] level
408
 * @param[in] code
409
 * @param[in] fmt
410
 */
411
3.27M
void htp_log(htp_connp_t *connp, const char *file, int line, enum htp_log_level_t level, int code, const char *fmt, ...) {
412
3.27M
    if (connp == NULL) return;
413
414
3.27M
    char buf[1024];
415
3.27M
    va_list args;
416
417
    // Ignore messages below our log level.
418
3.27M
    if (connp->cfg->log_level < level) {
419
0
        return;
420
0
    }
421
422
3.27M
    va_start(args, fmt);
423
424
3.27M
    int r = vsnprintf(buf, 1024, fmt, args);
425
426
3.27M
    va_end(args);
427
428
3.27M
    if (r < 0) {
429
0
        snprintf(buf, 1024, "[vnsprintf returned error %d]", r);
430
3.27M
    } else if (r >= 1024) {
431
        // Indicate overflow with a '+' at the end.
432
0
        buf[1022] = '+';
433
0
        buf[1023] = '\0';
434
0
    }
435
436
    // Create a new log entry.
437
438
3.27M
    htp_log_t *log = calloc(1, sizeof (htp_log_t));
439
3.27M
    if (log == NULL) return;
440
441
3.27M
    log->connp = connp;
442
3.27M
    log->file = file;
443
3.27M
    log->line = line;
444
3.27M
    log->level = level;
445
3.27M
    log->code = code;
446
3.27M
    log->msg = strdup(buf);
447
448
3.27M
    if (htp_list_add(connp->conn->messages, log) != HTP_OK) {
449
0
        free((void *) log->msg);
450
0
        free(log);
451
0
        return;
452
0
    }
453
454
3.27M
    if (level == HTP_LOG_ERROR) {
455
1.57M
        connp->last_error = log;
456
1.57M
    }
457
458
    #ifdef HTP_DEBUG
459
    fprintf(stderr, "[LOG] %s\n", log->msg);
460
    #endif
461
462
    /* coverity[check_return] */
463
3.27M
    htp_hook_run_all(connp->cfg->hook_log, log);
464
3.27M
}
465
466
/**
 * Determines if the given line is a continuation (of some previous line),
 * judging by its first byte only.
 *
 * @param[in] data Line data.
 * @param[in] len Line length in bytes.
 * @return 1 if the first byte is a folding character, 0 if it is not,
 *         and -1 on error (NULL pointer or length zero).
 */
int htp_connp_is_line_folded(unsigned char *data, size_t len) {
    int has_first_byte = (data != NULL) && (len != 0);

    return has_first_byte ? htp_is_folding_char(data[0]) : -1;
}
477
478
1.15M
/**
 * Tests whether a character marks a folded (continuation) line: a NUL byte
 * or a linear white space character.
 *
 * @param[in] c Character to test.
 * @return 1 if c is NUL or LWS, 0 otherwise.
 */
int htp_is_folding_char(int c) {
    return ((c == 0) || htp_is_lws(c)) ? 1 : 0;
}
482
483
/**
484
 * Determines if the given line is a request terminator.
485
 *
486
 * @param[in] connp
487
 * @param[in] data
488
 * @param[in] len
489
 * @return 0 or 1
490
 */
491
1.25M
int htp_connp_is_line_terminator(htp_connp_t *connp, unsigned char *data, size_t len, int next_no_lf) {
492
    // Is this the end of request headers?
493
1.25M
    switch (connp->cfg->server_personality) {
494
0
        case HTP_SERVER_IIS_5_1:
495
            // IIS 5 will accept a whitespace line as a terminator
496
0
            if (htp_is_line_whitespace(data, len)) {
497
0
                return 1;
498
0
            }
499
500
            // Fall through
501
1.25M
        default:
502
            // Treat an empty line as terminator
503
1.25M
            if (htp_is_line_empty(data, len)) {
504
258k
                return 1;
505
258k
            }
506
            // Only space is terminator if terminator does not follow right away
507
996k
            if (len == 2 && htp_is_lws(data[0]) && data[1] == LF) {
508
5.27k
                return next_no_lf;
509
5.27k
            }
510
991k
            break;
511
1.25M
    }
512
513
991k
    return 0;
514
1.25M
}
515
516
/**
517
 * Determines if the given line can be ignored when it appears before a request.
518
 *
519
 * @param[in] connp
520
 * @param[in] data
521
 * @param[in] len
522
 * @return 0 or 1
523
 */
524
429k
int htp_connp_is_line_ignorable(htp_connp_t *connp, unsigned char *data, size_t len) {
525
429k
    return htp_connp_is_line_terminator(connp, data, len, 0);
526
429k
}
527
528
4.40k
static htp_status_t htp_parse_port(unsigned char *data, size_t len, int *port, int *invalid) {
529
4.40k
    if (len == 0) {
530
1.28k
        *port = -1;
531
1.28k
        *invalid = 1;
532
1.28k
        return HTP_OK;
533
1.28k
    }
534
535
3.11k
    int64_t port_parsed = htp_parse_positive_integer_whitespace(data, len, 10);
536
537
3.11k
    if (port_parsed < 0) {
538
        // Failed to parse the port number.
539
1.24k
        *port = -1;
540
1.24k
        *invalid = 1;
541
1.87k
    } else if ((port_parsed > 0) && (port_parsed < 65536)) {
542
        // Valid port number.
543
1.48k
        *port = (int) port_parsed;
544
1.48k
    } else {
545
        // Port number out of range.
546
384
        *port = -1;
547
384
        *invalid = 1;
548
384
    }
549
550
3.11k
    return HTP_OK;
551
4.40k
}
552
553
/**
554
 * Parses an authority string, which consists of a hostname with an optional port number; username
555
 * and password are not allowed and will not be handled.
556
 *
557
 * @param[in] hostport
558
 * @param[out] hostname A bstring containing the hostname, or NULL if the hostname is invalid. If this value
559
 *                      is not NULL, the caller assumes responsibility for memory management.
560
 * @param[out] port Port as text, or NULL if not provided.
561
 * @param[out] port_number Port number, or -1 if the port is not present or invalid.
562
 * @param[out] invalid Set to 1 if any part of the authority is invalid.
563
 * @return HTP_OK on success, HTP_ERROR on memory allocation failure.
564
 */
565
8.66k
htp_status_t htp_parse_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, int *invalid) {
566
8.66k
    if ((hostport == NULL) || (hostname == NULL) || (port_number == NULL) || (invalid == NULL)) return HTP_ERROR;
567
568
8.66k
    *hostname = NULL;
569
8.66k
    if (port != NULL) {
570
3.33k
        *port = NULL;
571
3.33k
    }
572
8.66k
    *port_number = -1;
573
8.66k
    *invalid = 0;
574
575
8.66k
    unsigned char *data = bstr_ptr(hostport);
576
8.66k
    size_t len = bstr_len(hostport);
577
578
8.66k
    bstr_util_mem_trim(&data, &len);
579
580
8.66k
    if (len == 0) {
581
394
        *invalid = 1;
582
394
        return HTP_OK;
583
394
    }
584
585
    // Check for an IPv6 address.
586
8.27k
    if (data[0] == '[') {
587
        // IPv6 host.
588
589
        // Find the end of the IPv6 address.
590
1.60k
        size_t pos = 0;
591
8.72k
        while ((pos < len) && (data[pos] != ']')) pos++;
592
1.60k
        if (pos == len) {
593
300
            *invalid = 1;
594
300
            return HTP_OK;
595
300
        }
596
597
1.30k
        *hostname = bstr_dup_mem(data, pos + 1);
598
1.30k
        if (*hostname == NULL) return HTP_ERROR;
599
600
        // Over the ']'.
601
1.30k
        pos++;
602
1.30k
        if (pos == len) return HTP_OK;
603
604
        // Handle port.
605
1.25k
        if (data[pos] == ':') {
606
1.15k
            if (port != NULL) {
607
75
                *port = bstr_dup_mem(data + pos + 1, len - pos - 1);
608
75
                if (*port == NULL) {
609
0
                    bstr_free(*hostname);
610
0
                    return HTP_ERROR;
611
0
                }
612
75
            }
613
614
1.15k
            return htp_parse_port(data + pos + 1, len - pos - 1, port_number, invalid);
615
1.15k
        } else {
616
105
            *invalid = 1;
617
105
            return HTP_OK;
618
105
        }
619
6.66k
    } else {
620
        // Not IPv6 host.
621
622
        // Is there a colon?
623
6.66k
        unsigned char *colon = memchr(data, ':', len);
624
6.66k
        if (colon == NULL) {
625
            // Hostname alone, no port.
626
627
3.41k
            *hostname = bstr_dup_mem(data, len);
628
3.41k
            if (*hostname == NULL) return HTP_ERROR;
629
630
3.41k
            bstr_to_lowercase(*hostname);
631
3.41k
        } else {
632
            // Hostname and port.
633
634
            // Ignore whitespace at the end of hostname.
635
3.25k
            unsigned char *hostend = colon;
636
3.47k
            while ((hostend > data) && (isspace(*(hostend - 1)))) hostend--;
637
638
3.25k
            *hostname = bstr_dup_mem(data, hostend - data);
639
3.25k
            if (*hostname == NULL) return HTP_ERROR;
640
641
3.25k
            if (port != NULL) {
642
619
                *port = bstr_dup_mem(colon + 1, len - (colon + 1 - data));
643
619
                if (*port == NULL) {
644
0
                    bstr_free(*hostname);
645
0
                    return HTP_ERROR;
646
0
                }
647
619
            }
648
649
3.25k
            return htp_parse_port(colon + 1, len - (colon + 1 - data), port_number, invalid);
650
3.25k
        }
651
6.66k
    }
652
653
3.41k
    return HTP_OK;
654
8.27k
}
655
656
/**
657
 * Parses hostport provided in the URI.
658
 *
659
 * @param[in] connp
660
 * @param[in] hostport
661
 * @param[in] uri
662
 * @return HTP_OK on success or HTP_ERROR error.
663
 */
664
3.34k
int htp_parse_uri_hostport(htp_connp_t *connp, bstr *hostport, htp_uri_t *uri) {
665
3.34k
    int invalid;
666
667
3.34k
    htp_status_t rc = htp_parse_hostport(hostport, &(uri->hostname), &(uri->port), &(uri->port_number), &invalid);
668
3.34k
    if (rc != HTP_OK) return rc;
669
670
3.33k
    if (invalid) {
671
706
        connp->in_tx->flags |= HTP_HOSTU_INVALID;
672
706
    }
673
674
3.33k
    if (uri->hostname != NULL) {
675
3.31k
        if (htp_validate_hostname(uri->hostname) == 0) {
676
2.15k
            connp->in_tx->flags |= HTP_HOSTU_INVALID;
677
2.15k
        }
678
3.31k
    }
679
680
3.33k
    return HTP_OK;
681
3.34k
}
682
683
/**
684
 * Parses hostport provided in the Host header.
685
 * 
686
 * @param[in] hostport
687
 * @param[out] hostname
688
 * @param[out] port
689
 * @param[out] port_number
690
 * @param[out] flags
691
 * @return HTP_OK on success or HTP_ERROR error.
692
 */
693
5.32k
htp_status_t htp_parse_header_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, uint64_t *flags) {
694
5.32k
    int invalid;
695
696
5.32k
    htp_status_t rc = htp_parse_hostport(hostport, hostname, port, port_number, &invalid);
697
5.32k
    if (rc != HTP_OK) return rc;
698
699
5.32k
    if (invalid) {
700
3.00k
        *flags |= HTP_HOSTH_INVALID;
701
3.00k
    }
702
703
5.32k
    if (*hostname != NULL) {
704
4.65k
        if (htp_validate_hostname(*hostname) == 0) {
705
4.30k
            *flags |= HTP_HOSTH_INVALID;
706
4.30k
        }
707
4.65k
    }
708
709
5.32k
    return HTP_OK;
710
5.32k
}
711
712
/**
 * Parses request URI, making no attempt to validate the contents.
 *
 * The input is split into scheme, username, password, hostname, port (as
 * text), path, query and fragment; each component that is found is copied
 * into the corresponding field of the URI structure. Trailing spaces in the
 * input are ignored. The authority part is only looked for once a scheme
 * is set in the structure.
 *
 * @param[in] input URI text; may be NULL, in which case nothing is parsed.
 * @param[in,out] uri Address of the URI structure pointer. If *uri is NULL, a
 *                    zero-initialized structure is allocated and stored there.
 * @return HTP_ERROR on memory allocation failure, HTP_OK otherwise. On
 *         HTP_ERROR, the structure and any components already stored in it
 *         are left in place.
 */
int htp_parse_uri(bstr *input, htp_uri_t **uri) {
    // Allow a htp_uri_t structure to be provided on input,
    // but allocate a new one if the structure is NULL.
    if (*uri == NULL) {
        *uri = calloc(1, sizeof (htp_uri_t));
        if (*uri == NULL) return HTP_ERROR;
    }

    if (input == NULL) {
        // The input might be NULL on requests that don't actually
        // contain the URI. We allow that.
        return HTP_OK;
    }

    unsigned char *data = bstr_ptr(input);
    size_t len = bstr_len(input);
    // remove trailing spaces
    while (len > 0) {
        if (data[len-1] != ' ') {
            break;
        }
        len--;
    }
    // start marks the beginning of the component being extracted,
    // pos is the current read position; both are offsets into data.
    size_t start, pos;

    if (len == 0) {
        // Empty string.
        return HTP_OK;
    }

    pos = 0;

    // Scheme test: if it doesn't start with a forward slash character (which it must
    // for the contents to be a path or an authority), then it must be the scheme part
    if (data[0] != '/') {
        // Parse scheme        

        // Find the colon, which marks the end of the scheme part
        start = pos;
        while ((pos < len) && (data[pos] != ':')) pos++;

        if (pos >= len) {
            // We haven't found a colon, which means that the URI
            // is invalid. Apache will ignore this problem and assume
            // the URI contains an invalid path so, for the time being,
            // we are going to do the same.
            pos = 0;
        } else {
            // Make a copy of the scheme
            (*uri)->scheme = bstr_dup_mem(data + start, pos - start);
            if ((*uri)->scheme == NULL) return HTP_ERROR;

            // Go over the colon
            pos++;
        }
    }

    // Authority test: two forward slash characters and it's an authority.
    // One, three or more slash characters, and it's a path. We, however,
    // only attempt to parse authority if we've seen a scheme.
    if ((*uri)->scheme != NULL)
        // At least one byte must follow the two slashes, and it must not be a third slash.
        if ((pos + 2 < len) && (data[pos] == '/') && (data[pos + 1] == '/') && (data[pos + 2] != '/')) {
            // Parse authority

            // Go over the two slash characters
            start = pos = pos + 2;

            // Authority ends with a question mark, forward slash or hash
            while ((pos < len) && (data[pos] != '?') && (data[pos] != '/') && (data[pos] != '#')) pos++;

            unsigned char *hostname_start;
            size_t hostname_len;

            // Are the credentials included in the authority?
            // The first '@' inside the authority separates credentials from the host part.
            unsigned char *m = memchr(data + start, '@', pos - start);
            if (m != NULL) {
                // Credentials present
                unsigned char *credentials_start = data + start;
                size_t credentials_len = m - data - start;

                // Figure out just the hostname part
                // (everything after the '@', up to the end of the authority)
                hostname_start = data + start + credentials_len + 1;
                hostname_len = pos - start - credentials_len - 1;

                // Extract the username and the password
                // The first ':' inside the credentials separates the two.
                m = memchr(credentials_start, ':', credentials_len);
                if (m != NULL) {
                    // Username and password
                    (*uri)->username = bstr_dup_mem(credentials_start, m - credentials_start);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                    (*uri)->password = bstr_dup_mem(m + 1, credentials_len - (m - credentials_start) - 1);
                    if ((*uri)->password == NULL) return HTP_ERROR;
                } else {
                    // Username alone
                    (*uri)->username = bstr_dup_mem(credentials_start, credentials_len);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                }
            } else {
                // No credentials
                hostname_start = data + start;
                hostname_len = pos - start;
            }

            // Parsing authority without credentials.
            if ((hostname_len > 0) && (hostname_start[0] == '[')) {
                // IPv6 address.

                m = memchr(hostname_start, ']', hostname_len);
                if (m == NULL) {
                    // Invalid IPv6 address; use the entire string as hostname.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;
                } else {
                    // The hostname keeps both brackets.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, m - hostname_start + 1);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;

                    // Is there a port?
                    // From here on, hostname_start/hostname_len describe
                    // whatever follows the closing bracket.
                    hostname_len = hostname_len - (m - hostname_start + 1);
                    hostname_start = m + 1;

                    // Port string
                    m = memchr(hostname_start, ':', hostname_len);
                    if (m != NULL) {
                        size_t port_len = hostname_len - (m - hostname_start) - 1;
                        (*uri)->port = bstr_dup_mem(m + 1, port_len);
                        if ((*uri)->port == NULL) return HTP_ERROR;
                    }
                }
            } else {
                // Not IPv6 address.

                // The first ':' separates the hostname from the port.
                m = memchr(hostname_start, ':', hostname_len);
                if (m != NULL) {
                    size_t port_len = hostname_len - (m - hostname_start) - 1;
                    // Shrink the hostname so that it ends just before the colon.
                    hostname_len = hostname_len - port_len - 1;

                    // Port string
                    (*uri)->port = bstr_dup_mem(m + 1, port_len);
                    if ((*uri)->port == NULL) return HTP_ERROR;
                }

                // Hostname
                (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                if ((*uri)->hostname == NULL) return HTP_ERROR;
            }
        }

    // Path
    start = pos;

    // The path part will end with a question mark or a hash character, which
    // mark the beginning of the query part or the fragment part, respectively.
    while ((pos < len) && (data[pos] != '?') && (data[pos] != '#')) pos++;

    // Path
    (*uri)->path = bstr_dup_mem(data + start, pos - start);
    if ((*uri)->path == NULL) return HTP_ERROR;

    if (pos == len) return HTP_OK;

    // Query
    if (data[pos] == '?') {
        // Step over the question mark
        start = pos + 1;

        // The query part will end with the end of the input
        // or the beginning of the fragment part
        while ((pos < len) && (data[pos] != '#')) pos++;

        // Query string
        (*uri)->query = bstr_dup_mem(data + start, pos - start);
        if ((*uri)->query == NULL) return HTP_ERROR;

        if (pos == len) return HTP_OK;
    }

    // Fragment
    if (data[pos] == '#') {
        // Step over the hash character
        start = pos + 1;

        // Fragment; ends with the end of the input
        (*uri)->fragment = bstr_dup_mem(data + start, len - start);
        if ((*uri)->fragment == NULL) return HTP_ERROR;
    }

    return HTP_OK;
}
907
908
/**
 * Converts the two bytes at the given address into a single byte, treating
 * them as hexadecimal digits (most significant digit first). The input is
 * not validated; bytes that are not hexadecimal digits still produce a
 * result, reduced to the range of an unsigned char.
 *
 * @param[in] what Pointer to at least two readable bytes.
 * @return hex-decoded byte
 */
static unsigned char x2c(unsigned char *what) {
    // Bytes from 'A' upwards are treated as letters, with bit 0x20 cleared
    // to fold lowercase onto uppercase; anything lower is taken as a decimal digit.
    int high = (what[0] >= 'A') ? ((what[0] & 0xdf) - 'A') + 10 : (what[0] - '0');
    int low = (what[1] >= 'A') ? ((what[1] & 0xdf) - 'A') + 10 : (what[1] - '0');

    return (unsigned char) (high * 16 + low);
}
925
926
/**
927
 * Convert a Unicode codepoint into a single-byte, using best-fit
928
 * mapping (as specified in the provided configuration structure).
929
 *
930
 * @param[in] cfg
931
 * @param[in] codepoint
932
 * @return converted single byte
933
 */
934
86.1k
static uint8_t bestfit_codepoint(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, uint32_t codepoint) {
935
    // Is it a single-byte codepoint?
936
86.1k
    if (codepoint < 0x100) {
937
1.37k
        return (uint8_t) codepoint;
938
1.37k
    }
939
940
    // Our current implementation converts only the 2-byte codepoints.
941
84.7k
    if (codepoint > 0xffff) {
942
852
        return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
943
852
    }
944
945
83.9k
    uint8_t *p = cfg->decoder_cfgs[ctx].bestfit_map;
946
947
    // TODO Optimize lookup.
948
949
32.6M
    for (;;) {
950
32.6M
        uint32_t x = (p[0] << 8) + p[1];
951
952
32.6M
        if (x == 0) {
953
83.0k
            return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
954
83.0k
        }
955
956
32.5M
        if (x == codepoint) {
957
860
            return p[2];
958
860
        }
959
960
        // Move to the next triplet
961
32.5M
        p += 3;
962
32.5M
    }
963
83.9k
}
964
965
/**
966
 * Decode a UTF-8 encoded path. Overlong characters will be decoded, invalid
967
 * characters will be left as-is. Best-fit mapping will be used to convert
968
 * UTF-8 into a single-byte stream.
969
 *
970
 * @param[in] cfg
971
 * @param[in] tx
972
 * @param[in] path
973
 */
974
33.5k
void htp_utf8_decode_path_inplace(htp_cfg_t *cfg, htp_tx_t *tx, bstr *path) {
975
33.5k
    if (path == NULL) return;
976
977
33.5k
    uint8_t *data = bstr_ptr(path);
978
33.5k
    if (data == NULL) return;
979
980
33.5k
    size_t len = bstr_len(path);
981
33.5k
    size_t rpos = 0;
982
33.5k
    size_t wpos = 0;
983
33.5k
    uint32_t codepoint = 0;
984
33.5k
    uint32_t state = HTP_UTF8_ACCEPT;
985
33.5k
    uint32_t counter = 0;
986
33.5k
    uint8_t seen_valid = 0;
987
988
16.9M
    while ((rpos < len)&&(wpos < len)) {
989
16.8M
        counter++;
990
991
16.8M
        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
992
8.36M
            case HTP_UTF8_ACCEPT:
993
8.36M
                if (counter == 1) {
994
                    // ASCII character, which we just copy.
995
8.27M
                    data[wpos++] = (uint8_t) codepoint;
996
8.27M
                } else {
997
                    // A valid UTF-8 character, which we need to convert.
998
999
86.1k
                    seen_valid = 1;
1000
1001
                    // Check for overlong characters and set the flag accordingly.
1002
86.1k
                    switch (counter) {
1003
82.8k
                        case 2:
1004
82.8k
                            if (codepoint < 0x80) {
1005
369
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1006
369
                            }
1007
82.8k
                            break;
1008
1.80k
                        case 3:
1009
1.80k
                            if (codepoint < 0x800) {
1010
382
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1011
382
                            }
1012
1.80k
                            break;
1013
1.42k
                        case 4:
1014
1.42k
                            if (codepoint < 0x10000) {
1015
571
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1016
571
                            }
1017
1.42k
                            break;
1018
86.1k
                    }
1019
1020
                    // Special flag for half-width/full-width evasion.
1021
86.1k
                    if ((codepoint >= 0xff00) && (codepoint <= 0xffef)) {
1022
505
                        tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1023
505
                    }
1024
1025
                    // Use best-fit mapping to convert to a single byte.
1026
86.1k
                    data[wpos++] = bestfit_codepoint(cfg, HTP_DECODER_URL_PATH, codepoint);
1027
86.1k
                }
1028
1029
                // Advance over the consumed byte and reset the byte counter.
1030
8.36M
                rpos++;
1031
8.36M
                counter = 0;
1032
1033
8.36M
                break;
1034
1035
5.80M
            case HTP_UTF8_REJECT:
1036
                // Invalid UTF-8 character.
1037
1038
5.80M
                tx->flags |= HTP_PATH_UTF8_INVALID;
1039
1040
                // Is the server expected to respond with 400?
1041
5.80M
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1042
0
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted;
1043
0
                }
1044
1045
                // Output the replacement byte, replacing one or more invalid bytes.
1046
5.80M
                data[wpos++] = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1047
1048
                // If the invalid byte was first in a sequence, consume it. Otherwise,
1049
                // assume it's the starting byte of the next character.
1050
5.80M
                if (counter == 1) {
1051
3.16M
                    rpos++;
1052
3.16M
                }
1053
1054
                // Reset the decoder state and continue decoding.
1055
5.80M
                state = HTP_UTF8_ACCEPT;
1056
5.80M
                codepoint = 0;
1057
5.80M
                counter = 0;
1058
1059
5.80M
                break;
1060
1061
2.73M
            default:
1062
                // Keep going; the character is not yet formed.
1063
2.73M
                rpos++;
1064
2.73M
                break;
1065
16.8M
        }
1066
16.8M
    }
1067
1068
    // Did the input stream seem like a valid UTF-8 string?
1069
33.5k
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
1070
842
        tx->flags |= HTP_PATH_UTF8_VALID;
1071
842
    }
1072
1073
    // Adjust the length of the string, because
1074
    // we're doing in-place decoding.
1075
33.5k
    bstr_adjust_len(path, wpos);
1076
33.5k
}
1077
1078
/**
1079
 * Validate a path that is quite possibly UTF-8 encoded.
1080
 * 
1081
 * @param[in] tx
1082
 * @param[in] path
1083
 */
1084
0
void htp_utf8_validate_path(htp_tx_t *tx, bstr *path) {
1085
0
    unsigned char *data = bstr_ptr(path);
1086
0
    size_t len = bstr_len(path);
1087
0
    size_t rpos = 0;
1088
0
    uint32_t codepoint = 0;
1089
0
    uint32_t state = HTP_UTF8_ACCEPT;
1090
0
    uint32_t counter = 0; // How many bytes used by a UTF-8 character.
1091
0
    uint8_t seen_valid = 0;
1092
1093
0
    while (rpos < len) {
1094
0
        counter++;
1095
1096
0
        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
1097
0
            case HTP_UTF8_ACCEPT:
1098
                // We have a valid character.
1099
1100
0
                if (counter > 1) {
1101
                    // A valid UTF-8 character, consisting of 2 or more bytes.
1102
1103
0
                    seen_valid = 1;
1104
1105
                    // Check for overlong characters and set the flag accordingly.
1106
0
                    switch (counter) {
1107
0
                        case 2:
1108
0
                            if (codepoint < 0x80) {
1109
0
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1110
0
                            }
1111
0
                            break;
1112
0
                        case 3:
1113
0
                            if (codepoint < 0x800) {
1114
0
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1115
0
                            }
1116
0
                            break;
1117
0
                        case 4:
1118
0
                            if (codepoint < 0x10000) {
1119
0
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
1120
0
                            }
1121
0
                            break;
1122
0
                    }
1123
0
                }
1124
1125
                // Special flag for half-width/full-width evasion.
1126
0
                if ((codepoint > 0xfeff) && (codepoint < 0x010000)) {
1127
0
                    tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1128
0
                }
1129
1130
                // Advance over the consumed byte and reset the byte counter.
1131
0
                rpos++;
1132
0
                counter = 0;
1133
1134
0
                break;
1135
1136
0
            case HTP_UTF8_REJECT:
1137
                // Invalid UTF-8 character.
1138
1139
0
                tx->flags |= HTP_PATH_UTF8_INVALID;
1140
1141
                // Override the decoder state because we want to continue decoding.
1142
0
                state = HTP_UTF8_ACCEPT;
1143
1144
                // Advance over the consumed byte and reset the byte counter.
1145
0
                rpos++;
1146
0
                counter = 0;
1147
1148
0
                break;
1149
1150
0
            default:
1151
                // Keep going; the character is not yet formed.
1152
0
                rpos++;
1153
0
                break;
1154
0
        }
1155
0
    }
1156
1157
    // Did the input stream seem like a valid UTF-8 string?
1158
0
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
1159
0
        tx->flags |= HTP_PATH_UTF8_VALID;
1160
0
    }
1161
0
}
1162
1163
/**
1164
 * Decode a %u-encoded character, using best-fit mapping as necessary. Path version.
1165
 *
1166
 * @param[in] cfg
1167
 * @param[in] tx
1168
 * @param[in] data
1169
 * @return decoded byte
1170
 */
1171
124k
static uint8_t decode_u_encoding_path(htp_cfg_t *cfg, htp_tx_t *tx, unsigned char *data) {
1172
124k
    uint8_t c1 = x2c(data);
1173
124k
    uint8_t c2 = x2c(data + 2);
1174
124k
    uint8_t r = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1175
1176
124k
    if (c1 == 0x00) {
1177
368
        r = c2;
1178
368
        tx->flags |= HTP_PATH_OVERLONG_U;
1179
124k
    } else {
1180
        // Check for fullwidth form evasion
1181
124k
        if (c1 == 0xff) {
1182
92.6k
            tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1183
92.6k
        }
1184
1185
124k
        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1186
0
            tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1187
0
        }
1188
1189
        // Use best-fit mapping
1190
124k
        unsigned char *p = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_map;
1191
1192
        // TODO Optimize lookup.
1193
1194
44.4M
        for (;;) {
1195
            // Have we reached the end of the map?
1196
44.4M
            if ((p[0] == 0) && (p[1] == 0)) {
1197
61.7k
                break;
1198
61.7k
            }
1199
1200
            // Have we found the mapping we're looking for?
1201
44.3M
            if ((p[0] == c1) && (p[1] == c2)) {
1202
62.5k
                r = p[2];
1203
62.5k
                break;
1204
62.5k
            }
1205
1206
            // Move to the next triplet
1207
44.3M
            p += 3;
1208
44.3M
        }
1209
124k
    }
1210
1211
    // Check for encoded path separators
1212
124k
    if ((r == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (r == '\\'))) {
1213
1.71k
        tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1214
1.71k
    }
1215
1216
124k
    return r;
1217
124k
}
1218
1219
/**
1220
 * Decode a %u-encoded character, using best-fit mapping as necessary. Params version.
1221
 *
1222
 * @param[in] cfg
1223
 * @param[in] tx
1224
 * @param[in] data
1225
 * @return decoded byte
1226
 */
1227
166k
static uint8_t decode_u_encoding_params(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, unsigned char *data, uint64_t *flags) {
1228
166k
    uint8_t c1 = x2c(data);
1229
166k
    uint8_t c2 = x2c(data + 2);
1230
1231
    // Check for overlong usage first.
1232
166k
    if (c1 == 0) {
1233
16.3k
        (*flags) |= HTP_URLEN_OVERLONG_U;
1234
16.3k
        return c2;
1235
16.3k
    }
1236
1237
    // Both bytes were used.
1238
1239
    // Detect half-width and full-width range.
1240
150k
    if ((c1 == 0xff) && (c2 <= 0xef)) {
1241
82.5k
        (*flags) |= HTP_URLEN_HALF_FULL_RANGE;
1242
82.5k
    }
1243
1244
    // Use best-fit mapping.
1245
150k
    unsigned char *p = cfg->decoder_cfgs[ctx].bestfit_map;
1246
150k
    uint8_t r = cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
1247
1248
    // TODO Optimize lookup.
1249
1250
54.5M
    for (;;) {
1251
        // Have we reached the end of the map?
1252
54.5M
        if ((p[0] == 0) && (p[1] == 0)) {
1253
90.5k
            break;
1254
90.5k
        }
1255
1256
        // Have we found the mapping we're looking for?
1257
54.4M
        if ((p[0] == c1) && (p[1] == c2)) {
1258
59.4k
            r = p[2];
1259
59.4k
            break;
1260
59.4k
        }
1261
1262
        // Move to the next triplet
1263
54.3M
        p += 3;
1264
54.3M
    }
1265
1266
150k
    return r;
1267
166k
}
1268
1269
/**
 * Decode a request path in place, according to the URL-path decoder
 * settings (cfg->decoder_cfgs[HTP_DECODER_URL_PATH]) of the transaction's
 * configuration.
 *
 * Each pass of the loop reads one input unit (a raw byte, a %HH sequence
 * or, when enabled, a %uHHHH sequence), reduces it to one byte, and then
 * applies backslash conversion, lowercasing and separator compression
 * before writing it to the output position. Observations are recorded in
 * tx->flags and, where a setting is not HTP_UNWANTED_IGNORE, in
 * tx->response_status_expected_number.
 *
 * @param[in] tx transaction; supplies the configuration via tx->cfg
 * @param[in] path string to decode; its length is adjusted on success
 * @return HTP_OK on success (including early termination on a NUL byte);
 *         HTP_ERROR if path or its data pointer is NULL, or if the
 *         invalid-encoding handling setting is unknown in the %HH branch.
 */
htp_status_t htp_decode_path_inplace(htp_tx_t *tx, bstr *path) {
    if (path == NULL) return HTP_ERROR;
    unsigned char *data = bstr_ptr(path);
    if (data == NULL) return HTP_ERROR;

    size_t len = bstr_len(path);

    htp_cfg_t *cfg = tx->cfg;

    // rpos is the read cursor, wpos the write cursor into the same buffer.
    size_t rpos = 0;
    size_t wpos = 0;
    int previous_was_separator = 0;

    while ((rpos < len) && (wpos < len)) {
        uint8_t c = data[rpos];

        // Decode encoded characters
        if (c == '%') {
            // At least two more bytes must follow the percent character.
            if (rpos + 2 < len) {
                // Set once the %u branch has taken responsibility for this
                // sequence, so the standard %HH branch below is skipped.
                int handled = 0;

                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_decode) {
                    // Check for the %u encoding
                    if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
                        handled = 1;

                        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
                            tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
                        }

                        // %uHHHH needs five bytes after the percent character.
                        if (rpos + 5 < len) {
                            if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
                                    && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
                                // Decode a valid %u encoding
                                c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
                                rpos += 6;

                                // NOTE(review): unlike the %HH branch below, a NUL
                                // decoded here does not consult nul_encoded_terminates;
                                // confirm this is intended.
                                if (c == 0) {
                                    tx->flags |= HTP_PATH_ENCODED_NUL;

                                    if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
                                        tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
                                    }
                                }
                            } else {
                                // Invalid %u encoding
                                tx->flags |= HTP_PATH_INVALID_ENCODING;

                                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
                                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
                                }

                                switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
                                    case HTP_URL_DECODE_REMOVE_PERCENT:
                                        // Do not place anything in output; eat
                                        // the percent character. The continue
                                        // restarts the outer while loop.
                                        rpos++;
                                        continue;
                                        break;
                                    case HTP_URL_DECODE_PRESERVE_PERCENT:
                                        // Leave the percent character in output
                                        // (c still holds '%').
                                        rpos++;
                                        break;
                                    case HTP_URL_DECODE_PROCESS_INVALID:
                                        // Decode invalid %u encoding
                                        c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
                                        rpos += 6;
                                        break;
                                }
                            }
                        } else {
                            // Invalid %u encoding (not enough data)
                            tx->flags |= HTP_PATH_INVALID_ENCODING;

                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
                                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
                            }

                            switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
                                case HTP_URL_DECODE_REMOVE_PERCENT:
                                    // Do not place anything in output; eat
                                    // the percent character
                                    rpos++;
                                    continue;
                                    break;
                                case HTP_URL_DECODE_PRESERVE_PERCENT:
                                    // Leave the percent character in output
                                    rpos++;
                                    break;
                                case HTP_URL_DECODE_PROCESS_INVALID:
                                    // Cannot decode, because there's not enough data.
                                    // Leave the percent character in output
                                    rpos++;
                                    // TODO Configurable handling.
                                    break;
                            }
                        }
                    }
                }

                // Handle standard URL encoding
                if (!handled) {
                    if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
                        c = x2c(&data[rpos + 1]);

                        if (c == 0) {
                            tx->flags |= HTP_PATH_ENCODED_NUL;

                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
                                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
                            }

                            // Optionally end the path at the encoded NUL: the
                            // output is cut at what has been written so far.
                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_terminates) {
                                bstr_adjust_len(path, wpos);
                                return HTP_OK;
                            }
                        }

                        if ((c == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (c == '\\'))) {
                            tx->flags |= HTP_PATH_ENCODED_SEPARATOR;

                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted != HTP_UNWANTED_IGNORE) {
                                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted;
                            }

                            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_decode) {
                                // Decode
                                rpos += 3;
                            } else {
                                // Leave encoded: emit the percent character now;
                                // the two hex digits are copied as ordinary bytes
                                // by the following iterations.
                                c = '%';
                                rpos++;
                            }
                        } else {
                            // Decode
                            rpos += 3;
                        }
                    } else {
                        // Invalid encoding
                        tx->flags |= HTP_PATH_INVALID_ENCODING;

                        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
                            tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
                        }

                        switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
                            case HTP_URL_DECODE_REMOVE_PERCENT:
                                // Do not place anything in output; eat
                                // the percent character
                                rpos++;
                                continue;
                                break;
                            case HTP_URL_DECODE_PRESERVE_PERCENT:
                                // Leave the percent character in output
                                rpos++;
                                break;
                            case HTP_URL_DECODE_PROCESS_INVALID:
                                // Decode
                                c = x2c(&data[rpos + 1]);
                                rpos += 3;
                                // Note: What if an invalid encoding decodes into a path
                                //       separator? This is theoretical at the moment, because
                                //       the only platform we know doesn't convert separators is
                                //       Apache, who will also respond with 400 if invalid encoding
                                //       is encountered. Thus no check for a separator here.
                                break;
                            default:
                                // Unknown setting
                                return HTP_ERROR;
                                break;
                        }
                    }
                }
            } else {
                // Invalid URL encoding (not enough data)
                tx->flags |= HTP_PATH_INVALID_ENCODING;

                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
                }

                switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
                    case HTP_URL_DECODE_REMOVE_PERCENT:
                        // Do not place anything in output; eat
                        // the percent character
                        rpos++;
                        continue;
                        break;
                    case HTP_URL_DECODE_PRESERVE_PERCENT:
                        // Leave the percent character in output
                        rpos++;
                        break;
                    case HTP_URL_DECODE_PROCESS_INVALID:
                        // Cannot decode, because there's not enough data.
                        // Leave the percent character in output.
                        // TODO Configurable handling.
                        rpos++;
                        break;
                }
            }
        } else {
            // One non-encoded character

            // Is it a NUL byte?
            // NOTE(review): no HTP_PATH_RAW_NUL flag is set in this branch;
            // confirm whether that is intended.
            if (c == 0) {
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted;
                }

                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_terminates) {
                    // Terminate path with a raw NUL byte
                    bstr_adjust_len(path, wpos);
                    return HTP_OK;
                    break;
                }
            }

            rpos++;
        }

        // Place the character into output

        // Check for control characters
        if (c < 0x20) {
            if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted != HTP_UNWANTED_IGNORE) {
                tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted;
            }
        }

        // Convert backslashes to forward slashes, if necessary
        if ((c == '\\') && (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes)) {
            c = '/';
        }

        // Lowercase characters, if necessary
        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].convert_lowercase) {
            c = (uint8_t) tolower(c);
        }

        // If we're compressing separators then we need
        // to track if the previous character was a separator
        if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_compress) {
            if (c == '/') {
                if (!previous_was_separator) {
                    data[wpos++] = c;
                    previous_was_separator = 1;
                } else {
                    // Do nothing; we don't want
                    // another separator in output
                }
            } else {
                data[wpos++] = c;
                previous_was_separator = 0;
            }
        } else {
            data[wpos++] = c;
        }
    }

    // The decoded path occupies the first wpos bytes of the buffer.
    bstr_adjust_len(path, wpos);

    return HTP_OK;
}
1540
1541
25.2k
/**
 * URL-decode input in place with the URL-path decoder settings of the
 * transaction's configuration, then translate the HTP_URLEN_* flags the
 * decoder reported into the matching HTP_PATH_* flags on the transaction.
 * The translation happens regardless of the decoder's return code.
 *
 * @param[in] tx
 * @param[in] input
 * @return status returned by htp_urldecode_inplace_ex()
 */
htp_status_t htp_tx_urldecode_uri_inplace(htp_tx_t *tx, bstr *input) {
    uint64_t urlen_flags = 0;

    const htp_status_t status = htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URL_PATH, input,
            &urlen_flags, &(tx->response_status_expected_number));

    if ((urlen_flags & HTP_URLEN_INVALID_ENCODING) != 0) tx->flags |= HTP_PATH_INVALID_ENCODING;
    if ((urlen_flags & HTP_URLEN_ENCODED_NUL) != 0) tx->flags |= HTP_PATH_ENCODED_NUL;
    if ((urlen_flags & HTP_URLEN_RAW_NUL) != 0) tx->flags |= HTP_PATH_RAW_NUL;

    return status;
}
1560
1561
0
/**
 * URL-decode input in place with the HTP_DECODER_URLENCODED settings of
 * the transaction's configuration. Decoder flags are written directly
 * into tx->flags and the expected status into
 * tx->response_status_expected_number.
 *
 * @param[in] tx
 * @param[in] input
 * @return status returned by htp_urldecode_inplace_ex()
 */
htp_status_t htp_tx_urldecode_params_inplace(htp_tx_t *tx, bstr *input) {
    const htp_status_t status = htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URLENCODED, input,
            &(tx->flags), &(tx->response_status_expected_number));

    return status;
}
1564
1565
0
/**
 * URL-decode input in place for callers that have no use for the
 * expected response status: it is collected in a local and discarded.
 *
 * @param[in] cfg
 * @param[in] ctx
 * @param[in] input
 * @param[in,out] flags passed through to htp_urldecode_inplace_ex()
 * @return status returned by htp_urldecode_inplace_ex()
 */
htp_status_t htp_urldecode_inplace(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags) {
    int discarded_status = 0;
    const htp_status_t status = htp_urldecode_inplace_ex(cfg, ctx, input, flags, &discarded_status);

    return status;
}
1569
1570
25.2k
htp_status_t htp_urldecode_inplace_ex(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags, int *expected_status_code) {
1571
25.2k
    if (input == NULL) return HTP_ERROR;
1572
1573
25.2k
    unsigned char *data = bstr_ptr(input);
1574
25.2k
    if (data == NULL) return HTP_ERROR;
1575
25.2k
    size_t len = bstr_len(input);
1576
1577
25.2k
    size_t rpos = 0;
1578
25.2k
    size_t wpos = 0;
1579
1580
5.30M
    while ((rpos < len) && (wpos < len)) {
1581
5.28M
        uint8_t c = data[rpos];
1582
1583
        // Decode encoded characters.
1584
5.28M
        if (c == '%') {
1585
            // Need at least 2 additional bytes for %HH.
1586
246k
            if (rpos + 2 < len) {
1587
244k
                int handled = 0;
1588
1589
                // Decode %uHHHH encoding, but only if allowed in configuration.
1590
244k
                if (cfg->decoder_cfgs[ctx].u_encoding_decode) {
1591
                    // The next character must be a case-insensitive u.
1592
244k
                    if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1593
212k
                        handled = 1;
1594
1595
212k
                        if (cfg->decoder_cfgs[ctx].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1596
0
                            (*expected_status_code) = cfg->decoder_cfgs[ctx].u_encoding_unwanted;
1597
0
                        }
1598
1599
                        // Need at least 5 additional bytes for %uHHHH.
1600
212k
                        if (rpos + 5 < len) {
1601
211k
                            if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1602
211k
                                    && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1603
                                // Decode a valid %u encoding.
1604
166k
                                c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1605
166k
                                rpos += 6;
1606
166k
                            } else {
1607
                                // Invalid %u encoding (could not find 4 xdigits).
1608
45.4k
                                (*flags) |= HTP_URLEN_INVALID_ENCODING;
1609
1610
45.4k
                                if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1611
0
                                    (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1612
0
                                }
1613
1614
45.4k
                                switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1615
0
                                    case HTP_URL_DECODE_REMOVE_PERCENT:
1616
                                        // Do not place anything in output; consume the %.
1617
0
                                        rpos++;
1618
0
                                        continue;
1619
0
                                        break;
1620
45.4k
                                    case HTP_URL_DECODE_PRESERVE_PERCENT:
1621
                                        // Leave the % in output.
1622
45.4k
                                        rpos++;
1623
45.4k
                                        break;
1624
0
                                    case HTP_URL_DECODE_PROCESS_INVALID:
1625
                                        // Decode invalid %u encoding.
1626
0
                                        c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1627
0
                                        rpos += 6;
1628
0
                                        break;
1629
45.4k
                                }
1630
45.4k
                            }
1631
211k
                        } else {
1632
                            // Invalid %u encoding; not enough data.
1633
765
                            (*flags) |= HTP_URLEN_INVALID_ENCODING;
1634
1635
765
                            if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1636
0
                                (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1637
0
                            }
1638
1639
765
                            switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1640
0
                                case HTP_URL_DECODE_REMOVE_PERCENT:
1641
                                    // Do not place anything in output; consume the %.
1642
0
                                    rpos++;
1643
0
                                    continue;
1644
0
                                    break;
1645
765
                                case HTP_URL_DECODE_PRESERVE_PERCENT:
1646
                                    // Leave the % in output.
1647
765
                                    rpos++;
1648
765
                                    break;
1649
0
                                case HTP_URL_DECODE_PROCESS_INVALID:
1650
                                    // Cannot decode because there's not enough data.
1651
                                    // Leave the % in output.
1652
                                    // TODO Configurable handling of %, u, etc.
1653
0
                                    rpos++;
1654
0
                                    break;
1655
765
                            }
1656
765
                        }
1657
212k
                    }
1658
244k
                }
1659
1660
                // Handle standard URL encoding.
1661
244k
                if (!handled) {
1662
                    // Need 2 hexadecimal digits.
1663
31.4k
                    if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1664
                        // Decode %HH encoding.
1665
4.86k
                        c = x2c(&(data[rpos + 1]));
1666
4.86k
                        rpos += 3;
1667
26.5k
                    } else {
1668
                        // Invalid encoding (enough bytes, but not hexadecimal digits).
1669
26.5k
                        (*flags) |= HTP_URLEN_INVALID_ENCODING;
1670
1671
26.5k
                        if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1672
0
                            (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1673
0
                        }
1674
1675
26.5k
                        switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1676
0
                            case HTP_URL_DECODE_REMOVE_PERCENT:
1677
                                // Do not place anything in output; consume the %.
1678
0
                                rpos++;
1679
0
                                continue;
1680
0
                                break;
1681
26.5k
                            case HTP_URL_DECODE_PRESERVE_PERCENT:
1682
                                // Leave the % in output.
1683
26.5k
                                rpos++;
1684
26.5k
                                break;
1685
0
                            case HTP_URL_DECODE_PROCESS_INVALID:
1686
                                // Decode.
1687
0
                                c = x2c(&(data[rpos + 1]));
1688
0
                                rpos += 3;
1689
0
                                break;
1690
26.5k
                        }
1691
26.5k
                    }
1692
31.4k
                }
1693
244k
            } else {
1694
                // Invalid encoding; not enough data (at least 2 bytes required).
1695
2.12k
                (*flags) |= HTP_URLEN_INVALID_ENCODING;
1696
1697
2.12k
                if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1698
0
                    (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1699
0
                }
1700
1701
2.12k
                switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1702
0
                    case HTP_URL_DECODE_REMOVE_PERCENT:
1703
                        // Do not place anything in output; consume the %.
1704
0
                        rpos++;
1705
0
                        continue;
1706
0
                        break;
1707
2.12k
                    case HTP_URL_DECODE_PRESERVE_PERCENT:
1708
                        // Leave the % in output.
1709
2.12k
                        rpos++;
1710
2.12k
                        break;
1711
0
                    case HTP_URL_DECODE_PROCESS_INVALID:
1712
                        // Cannot decode because there's not enough data.
1713
                        // Leave the % in output.
1714
                        // TODO Configurable handling of %, etc.
1715
0
                        rpos++;
1716
0
                        break;
1717
2.12k
                }
1718
2.12k
            }
1719
1720
            // Did we get an encoded NUL byte?
1721
246k
            if (c == 0) {
1722
16.5k
                if (cfg->decoder_cfgs[ctx].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1723
0
                    (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_encoded_unwanted;
1724
0
                }
1725
1726
16.5k
                (*flags) |= HTP_URLEN_ENCODED_NUL;
1727
1728
16.5k
                if (cfg->decoder_cfgs[ctx].nul_encoded_terminates) {
1729
                    // Terminate the path at the raw NUL byte.
1730
0
                    bstr_adjust_len(input, wpos);
1731
0
                    return 1;
1732
0
                }
1733
16.5k
            }
1734
1735
246k
            data[wpos++] = c;
1736
5.03M
        } else if (c == '+') {
1737
            // Decoding of the plus character is conditional on the configuration.
1738
1739
3.36k
            if (cfg->decoder_cfgs[ctx].plusspace_decode) {
1740
0
                c = 0x20;
1741
0
            }
1742
1743
3.36k
            rpos++;
1744
3.36k
            data[wpos++] = c;
1745
5.03M
        } else {
1746
            // One non-encoded byte.
1747
1748
            // Did we get a raw NUL byte?
1749
5.03M
            if (c == 0) {
1750
196k
                if (cfg->decoder_cfgs[ctx].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1751
0
                    (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_raw_unwanted;
1752
0
                }
1753
1754
196k
                (*flags) |= HTP_URLEN_RAW_NUL;
1755
1756
196k
                if (cfg->decoder_cfgs[ctx].nul_raw_terminates) {
1757
                    // Terminate the path at the encoded NUL byte.
1758
0
                    bstr_adjust_len(input, wpos);
1759
0
                    return HTP_OK;
1760
0
                }
1761
196k
            }
1762
1763
5.03M
            rpos++;
1764
5.03M
            data[wpos++] = c;
1765
5.03M
        }
1766
5.28M
    }
1767
1768
25.2k
    bstr_adjust_len(input, wpos);
1769
1770
25.2k
    return HTP_OK;
1771
25.2k
}
1772
1773
/**
1774
 * Normalize a previously-parsed request URI.
1775
 *
1776
 * @param[in] connp
1777
 * @param[in] incomplete
1778
 * @param[in] normalized
1779
 * @return HTP_OK or HTP_ERROR
1780
 */
1781
64.4k
int htp_normalize_parsed_uri(htp_tx_t *tx, htp_uri_t *incomplete, htp_uri_t *normalized) {
1782
    // Scheme.
1783
64.4k
    if (incomplete->scheme != NULL) {
1784
        // Duplicate and convert to lowercase.
1785
20.7k
        normalized->scheme = bstr_dup_lower(incomplete->scheme);
1786
20.7k
        if (normalized->scheme == NULL) return HTP_ERROR;
1787
20.7k
    }
1788
1789
    // Username.
1790
64.4k
    if (incomplete->username != NULL) {
1791
2.35k
        normalized->username = bstr_dup(incomplete->username);
1792
2.35k
        if (normalized->username == NULL) return HTP_ERROR;
1793
2.35k
        htp_tx_urldecode_uri_inplace(tx, normalized->username);
1794
2.35k
    }
1795
1796
    // Password.
1797
64.4k
    if (incomplete->password != NULL) {
1798
1.21k
        normalized->password = bstr_dup(incomplete->password);
1799
1.21k
        if (normalized->password == NULL) return HTP_ERROR;
1800
1.21k
        htp_tx_urldecode_uri_inplace(tx, normalized->password);
1801
1.21k
    }
1802
1803
    // Hostname.
1804
64.4k
    if (incomplete->hostname != NULL) {
1805
        // We know that incomplete->hostname does not contain
1806
        // port information, so no need to check for it here.
1807
19.7k
        normalized->hostname = bstr_dup(incomplete->hostname);
1808
19.7k
        if (normalized->hostname == NULL) return HTP_ERROR;
1809
19.7k
        htp_tx_urldecode_uri_inplace(tx, normalized->hostname);
1810
19.7k
        htp_normalize_hostname_inplace(normalized->hostname);
1811
19.7k
    }
1812
1813
    // Port.
1814
64.4k
    if (incomplete->port != NULL) {
1815
8.85k
        int64_t port_parsed = htp_parse_positive_integer_whitespace(
1816
8.85k
                bstr_ptr(incomplete->port), bstr_len(incomplete->port), 10);
1817
1818
8.85k
        if (port_parsed < 0) {
1819
            // Failed to parse the port number.
1820
5.66k
            normalized->port_number = -1;
1821
5.66k
            tx->flags |= HTP_HOSTU_INVALID;
1822
5.66k
        } else if ((port_parsed > 0) && (port_parsed < 65536)) {
1823
            // Valid port number.
1824
2.51k
            normalized->port_number = (int) port_parsed;
1825
2.51k
        } else {
1826
            // Port number out of range.
1827
674
            normalized->port_number = -1;
1828
674
            tx->flags |= HTP_HOSTU_INVALID;
1829
674
        }
1830
55.5k
    } else {
1831
55.5k
        normalized->port_number = -1;
1832
55.5k
    }
1833
1834
    // Path.
1835
64.4k
    if (incomplete->path != NULL) {
1836
        // Make a copy of the path, so that we can work on it.
1837
33.5k
        normalized->path = bstr_dup(incomplete->path);
1838
33.5k
        if (normalized->path == NULL) return HTP_ERROR;
1839
1840
        // Decode URL-encoded (and %u-encoded) characters, as well as lowercase,
1841
        // compress separators and convert backslashes.
1842
33.5k
        htp_decode_path_inplace(tx, normalized->path);
1843
1844
        // Handle UTF-8 in the path.
1845
33.5k
        if (tx->cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_convert_bestfit) {
1846
            // Decode Unicode characters into a single-byte stream, using best-fit mapping.
1847
33.5k
            htp_utf8_decode_path_inplace(tx->cfg, tx, normalized->path);
1848
33.5k
        } else {
1849
            // No decoding, but try to validate the path as a UTF-8 stream.
1850
0
            htp_utf8_validate_path(tx, normalized->path);
1851
0
        }
1852
1853
        // RFC normalization.
1854
33.5k
        htp_normalize_uri_path_inplace(normalized->path);
1855
33.5k
    }
1856
1857
    // Query string.
1858
64.4k
    if (incomplete->query != NULL) {
1859
1.58k
        normalized->query = bstr_dup(incomplete->query);
1860
1.58k
        if (normalized->query == NULL) return HTP_ERROR;
1861
1.58k
    }
1862
1863
    // Fragment.
1864
64.4k
    if (incomplete->fragment != NULL) {
1865
1.96k
        normalized->fragment = bstr_dup(incomplete->fragment);
1866
1.96k
        if (normalized->fragment == NULL) return HTP_ERROR;
1867
1.96k
        htp_tx_urldecode_uri_inplace(tx, normalized->fragment);
1868
1.96k
    }
1869
1870
64.4k
    return HTP_OK;
1871
64.4k
}
1872
1873
/**
1874
 * Normalize request hostname. Convert all characters to lowercase and
1875
 * remove trailing dots from the end, if present.
1876
 *
1877
 * @param[in] hostname
1878
 * @return Normalized hostname.
1879
 */
1880
19.7k
bstr *htp_normalize_hostname_inplace(bstr *hostname) {
1881
19.7k
    if (hostname == NULL) return NULL;
1882
1883
19.7k
    bstr_to_lowercase(hostname);
1884
1885
    // Remove dots from the end of the string.    
1886
20.8k
    while (bstr_char_at_end(hostname, 0) == '.') bstr_chop(hostname);
1887
1888
19.7k
    return hostname;
1889
19.7k
}
1890
1891
/**
1892
 * Normalize URL path. This function implements the remove dot segments algorithm
1893
 * specified in RFC 3986, section 5.2.4.
1894
 *
1895
 * @param[in] s
1896
 */
1897
33.5k
void htp_normalize_uri_path_inplace(bstr *s) {
1898
33.5k
    if (s == NULL) return;
1899
1900
33.5k
    unsigned char *data = bstr_ptr(s);
1901
33.5k
    if (data == NULL) return;
1902
33.5k
    size_t len = bstr_len(s);
1903
1904
33.5k
    size_t rpos = 0;
1905
33.5k
    size_t wpos = 0;
1906
1907
33.5k
    int c = -1;
1908
1.30M
    while ((rpos < len)&&(wpos < len)) {
1909
1.27M
        if (c == -1) {
1910
1.17M
            c = data[rpos++];
1911
1.17M
        }
1912
1913
        // A. If the input buffer begins with a prefix of "../" or "./",
1914
        //    then remove that prefix from the input buffer; otherwise,
1915
1.27M
        if (c == '.') {
1916
6.79k
            if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
1917
2.16k
                c = -1;
1918
2.16k
                rpos += 2;
1919
2.16k
                continue;
1920
4.63k
            } else if ((rpos < len) && (data[rpos] == '/')) {
1921
2.33k
                c = -1;
1922
2.33k
                rpos += 1;
1923
2.33k
                continue;
1924
2.33k
            }
1925
6.79k
        }
1926
1927
1.26M
        if (c == '/') {
1928
            // B. if the input buffer begins with a prefix of "/./" or "/.",
1929
            //    where "." is a complete path segment, then replace that
1930
            //    prefix with "/" in the input buffer; otherwise,
1931
1.25M
            if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
1932
92.3k
                c = '/';
1933
92.3k
                rpos += 2;
1934
92.3k
                continue;
1935
1.16M
            } else if ((rpos + 1 == len) && (data[rpos] == '.')) {
1936
635
                c = '/';
1937
635
                rpos += 1;
1938
635
                continue;
1939
635
            }
1940
1941
            // C. if the input buffer begins with a prefix of "/../" or "/..",
1942
            //    where ".." is a complete path segment, then replace that
1943
            //    prefix with "/" in the input buffer and remove the last
1944
            //    segment and its preceding "/" (if any) from the output
1945
            //    buffer; otherwise,
1946
1.16M
            if ((rpos + 2 < len) && (data[rpos] == '.') && (data[rpos + 1] == '.') && (data[rpos + 2] == '/')) {
1947
8.76k
                c = '/';
1948
8.76k
                rpos += 3;
1949
1950
                // Remove the last segment
1951
103k
                while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
1952
8.76k
                if (wpos > 0) wpos--;
1953
8.76k
                continue;
1954
1.15M
            } else if ((rpos + 2 == len) && (data[rpos] == '.') && (data[rpos + 1] == '.')) {
1955
921
                c = '/';
1956
921
                rpos += 2;
1957
1958
                // Remove the last segment
1959
13.4k
                while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
1960
921
                if (wpos > 0) wpos--;
1961
921
                continue;
1962
921
            }
1963
1.16M
        }
1964
1965
        // D.  if the input buffer consists only of "." or "..", then remove
1966
        // that from the input buffer; otherwise,
1967
1.16M
        if ((c == '.') && (rpos == len)) {
1968
371
            rpos++;
1969
371
            continue;
1970
371
        }
1971
1972
1.16M
        if ((c == '.') && (rpos + 1 == len) && (data[rpos] == '.')) {
1973
452
            rpos += 2;
1974
452
            continue;
1975
452
        }
1976
1977
        // E.  move the first path segment in the input buffer to the end of
1978
        // the output buffer, including the initial "/" character (if
1979
        // any) and any subsequent characters up to, but not including,
1980
        // the next "/" character or the end of the input buffer.
1981
1.16M
        data[wpos++] = (uint8_t) c;
1982
1983
13.9M
        while ((rpos < len) && (data[rpos] != '/') && (wpos < len)) {
1984
12.7M
            data[wpos++] = data[rpos++];
1985
12.7M
        }
1986
1987
1.16M
        c = -1;
1988
1.16M
    }
1989
1990
33.5k
    bstr_adjust_len(s, wpos);
1991
33.5k
}
1992
1993
/**
1994
 *
1995
 */
1996
0
void fprint_bstr(FILE *stream, const char *name, bstr *b) {
1997
0
    if (b == NULL) {
1998
0
        fprint_raw_data_ex(stream, name, "(null)", 0, 6);
1999
0
        return;
2000
0
    }
2001
2002
0
    fprint_raw_data_ex(stream, name, bstr_ptr(b), 0, bstr_len(b));
2003
0
}
2004
2005
/**
 * Dump a memory region to a stream, starting at offset zero.
 *
 * @param[in] stream Output stream.
 * @param[in] name   Label for the dump.
 * @param[in] data   Region to dump; NULL is reported instead of dumped.
 * @param[in] len    Number of bytes in the region.
 */
void fprint_raw_data(FILE *stream, const char *name, const void *data, size_t len) {
    if (data != NULL) {
        fprint_raw_data_ex(stream, name, data, 0, len);
        return;
    }

    // No buffer to dump; may happen for gaps.
    fprintf(stream, "\n%s: ptr NULL len %u\n", name, (unsigned int)len);
}
2016
2017
/**
 * Write a hex dump of a memory region to a stream.
 *
 * A header line (name, pointer, starting offset and end position, i.e.
 * offset + printlen) is written first, followed by one row per 16 bytes.
 * Each row holds the row offset in hexadecimal, two groups of eight
 * two-digit hex byte values, and the same bytes as characters between
 * '|' markers, with non-printable bytes shown as '.'.
 *
 * @param[in] stream   Output stream.
 * @param[in] name     Label printed in the header line.
 * @param[in] _data    Base of the memory region; it is dereferenced whenever
 *                     printlen is non-zero.
 * @param[in] offset   Offset, relative to _data, of the first byte to print.
 * @param[in] printlen Number of bytes to print.
 */
void fprint_raw_data_ex(FILE *stream, const char *name, const void *_data, size_t offset, size_t printlen) {
    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char *data = (const unsigned char *) _data;
    // Longest possible row: 16 offset digits + 2 spaces + 16 * 3 byte cells
    // + 1 group separator + " |" + 16 characters + "|\n" + NUL = 88 bytes.
    char buf[160];
    size_t len = offset + printlen;

    fprintf(stream, "\n%s: ptr %p offset %u len %u\n", name, (void*) data, (unsigned int)offset, (unsigned int)len);

    while (offset < len) {
        // Row offset. The conversion specifier has to be built as
        // "%" PRIx64; writing "%x" PRIx64 prints the length modifier
        // ("lx" / "llx") literally after the number.
        int prefix = snprintf(buf, sizeof(buf), "%" PRIx64 "  ", (uint64_t) offset);
        if (prefix < 0) prefix = 0;
        char *p = buf + prefix;

        // Hex columns: two groups of eight cells, separated by one extra
        // space. Cells past the end of the data are left blank so that the
        // character column stays aligned.
        for (size_t i = 0; i < 16; i++) {
            if (i == 8) *p++ = ' ';

            if (offset + i < len) {
                const unsigned char byte = data[offset + i];
                *p++ = hex_digits[byte >> 4];
                *p++ = hex_digits[byte & 0x0f];
            } else {
                *p++ = ' ';
                *p++ = ' ';
            }

            *p++ = ' ';
        }

        *p++ = ' ';
        *p++ = '|';

        // Character column.
        for (size_t i = 0; (i < 16) && (offset + i < len); i++) {
            const uint8_t c = data[offset + i];
            *p++ = isprint(c) ? (char) c : '.';
        }

        *p++ = '|';
        *p++ = '\n';
        *p = '\0';

        fprintf(stream, "%s", buf);
        offset += 16;
    }

    fprintf(stream, "\n");
}
2087
2088
/**
2089
 *
2090
 */
2091
0
char *htp_connp_in_state_as_string(htp_connp_t *connp) {
2092
0
    if (connp == NULL) return "NULL";
2093
2094
0
    if (connp->in_state == htp_connp_REQ_IDLE) return "REQ_IDLE";
2095
0
    if (connp->in_state == htp_connp_REQ_LINE) return "REQ_LINE";
2096
0
    if (connp->in_state == htp_connp_REQ_PROTOCOL) return "REQ_PROTOCOL";
2097
0
    if (connp->in_state == htp_connp_REQ_HEADERS) return "REQ_HEADERS";
2098
0
    if (connp->in_state == htp_connp_REQ_CONNECT_CHECK) return "REQ_CONNECT_CHECK";
2099
0
    if (connp->in_state == htp_connp_REQ_CONNECT_WAIT_RESPONSE) return "REQ_CONNECT_WAIT_RESPONSE";
2100
0
    if (connp->in_state == htp_connp_REQ_BODY_DETERMINE) return "REQ_BODY_DETERMINE";
2101
0
    if (connp->in_state == htp_connp_REQ_BODY_IDENTITY) return "REQ_BODY_IDENTITY";
2102
0
    if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_LENGTH) return "REQ_BODY_CHUNKED_LENGTH";
2103
0
    if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA) return "REQ_BODY_CHUNKED_DATA";
2104
0
    if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA_END) return "REQ_BODY_CHUNKED_DATA_END";
2105
0
    if (connp->in_state == htp_connp_REQ_FINALIZE) return "REQ_FINALIZE";
2106
0
    if (connp->in_state == htp_connp_REQ_IGNORE_DATA_AFTER_HTTP_0_9) return "REQ_IGNORE_DATA_AFTER_HTTP_0_9";
2107
2108
0
    return "UNKNOWN";
2109
0
}
2110
2111
/**
2112
 *
2113
 */
2114
0
char *htp_connp_out_state_as_string(htp_connp_t *connp) {
2115
0
    if (connp == NULL) return "NULL";
2116
2117
0
    if (connp->out_state == htp_connp_RES_IDLE) return "RES_IDLE";
2118
0
    if (connp->out_state == htp_connp_RES_LINE) return "RES_LINE";
2119
0
    if (connp->out_state == htp_connp_RES_HEADERS) return "RES_HEADERS";
2120
0
    if (connp->out_state == htp_connp_RES_BODY_DETERMINE) return "RES_BODY_DETERMINE";
2121
0
    if (connp->out_state == htp_connp_RES_BODY_IDENTITY_CL_KNOWN) return "RES_BODY_IDENTITY_CL_KNOWN";
2122
0
    if (connp->out_state == htp_connp_RES_BODY_IDENTITY_STREAM_CLOSE) return "RES_BODY_IDENTITY_STREAM_CLOSE";
2123
0
    if (connp->out_state == htp_connp_RES_BODY_CHUNKED_LENGTH) return "RES_BODY_CHUNKED_LENGTH";
2124
0
    if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA) return "RES_BODY_CHUNKED_DATA";
2125
0
    if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA_END) return "RES_BODY_CHUNKED_DATA_END";
2126
0
    if (connp->out_state == htp_connp_RES_FINALIZE) return "RES_BODY_FINALIZE";
2127
2128
0
    return "UNKNOWN";
2129
0
}
2130
2131
/**
2132
 *
2133
 */
2134
0
char *htp_tx_request_progress_as_string(htp_tx_t *tx) {
2135
0
    if (tx == NULL) return "NULL";
2136
2137
0
    switch (tx->request_progress) {
2138
0
        case HTP_REQUEST_NOT_STARTED:
2139
0
            return "NOT_STARTED";
2140
0
        case HTP_REQUEST_LINE:
2141
0
            return "REQ_LINE";
2142
0
        case HTP_REQUEST_HEADERS:
2143
0
            return "REQ_HEADERS";
2144
0
        case HTP_REQUEST_BODY:
2145
0
            return "REQ_BODY";
2146
0
        case HTP_REQUEST_TRAILER:
2147
0
            return "REQ_TRAILER";
2148
0
        case HTP_REQUEST_COMPLETE:
2149
0
            return "COMPLETE";
2150
0
    }
2151
2152
0
    return "INVALID";
2153
0
}
2154
2155
/**
2156
 *
2157
 */
2158
0
char *htp_tx_response_progress_as_string(htp_tx_t *tx) {
2159
0
    if (tx == NULL) return "NULL";
2160
2161
0
    switch (tx->response_progress) {
2162
0
        case HTP_RESPONSE_NOT_STARTED:
2163
0
            return "NOT_STARTED";
2164
0
        case HTP_RESPONSE_LINE:
2165
0
            return "RES_LINE";
2166
0
        case HTP_RESPONSE_HEADERS:
2167
0
            return "RES_HEADERS";
2168
0
        case HTP_RESPONSE_BODY:
2169
0
            return "RES_BODY";
2170
0
        case HTP_RESPONSE_TRAILER:
2171
0
            return "RES_TRAILER";
2172
0
        case HTP_RESPONSE_COMPLETE:
2173
0
            return "COMPLETE";
2174
0
    }
2175
2176
0
    return "INVALID";
2177
0
}
2178
2179
0
/**
 * Reassemble a URI string from its parsed components, copying every
 * component as-is (no encoding is applied).
 *
 * Layout: scheme "://" username ":" password "@" hostname ":" port
 * path "?" query "#" fragment. Components that are NULL are left out
 * together with their delimiter, except that the ":" and "@" of the
 * credentials part are both written whenever a username or a password
 * is present.
 *
 * @param[in] uri Parsed URI; may be NULL.
 * @return Newly allocated string, or NULL if uri is NULL or the
 *         allocation failed.
 */
bstr *htp_unparse_uri_noencode(htp_uri_t *uri) {
    if (uri == NULL) return NULL;

    const int has_credentials = (uri->username != NULL) || (uri->password != NULL);

    // Pass 1: add up the exact length of the final string.
    size_t total = 0;

    if (uri->scheme != NULL) total += bstr_len(uri->scheme) + 3; // "://"

    if (has_credentials) {
        if (uri->username != NULL) total += bstr_len(uri->username);
        if (uri->password != NULL) total += bstr_len(uri->password);
        total += 2; // ":" and "@"
    }

    if (uri->hostname != NULL) total += bstr_len(uri->hostname);
    if (uri->port != NULL) total += 1 + bstr_len(uri->port); // ":"
    if (uri->path != NULL) total += bstr_len(uri->path);
    if (uri->query != NULL) total += 1 + bstr_len(uri->query); // "?"
    if (uri->fragment != NULL) total += 1 + bstr_len(uri->fragment); // "#"

    // Pass 2: append the pieces into a buffer of that size.
    bstr *result = bstr_alloc(total);
    if (result == NULL) return NULL;

    if (uri->scheme != NULL) {
        bstr_add_noex(result, uri->scheme);
        bstr_add_c_noex(result, "://");
    }

    if (has_credentials) {
        if (uri->username != NULL) bstr_add_noex(result, uri->username);
        bstr_add_c_noex(result, ":");
        if (uri->password != NULL) bstr_add_noex(result, uri->password);
        bstr_add_c_noex(result, "@");
    }

    if (uri->hostname != NULL) bstr_add_noex(result, uri->hostname);

    if (uri->port != NULL) {
        bstr_add_c_noex(result, ":");
        bstr_add_noex(result, uri->port);
    }

    if (uri->path != NULL) bstr_add_noex(result, uri->path);

    if (uri->query != NULL) {
        bstr_add_c_noex(result, "?");
        bstr_add_noex(result, uri->query);
    }

    if (uri->fragment != NULL) {
        bstr_add_c_noex(result, "#");
        bstr_add_noex(result, uri->fragment);
    }

    return result;
}
2275
2276
/**
 * Decide whether a supposed response line should be treated as response
 * body data instead. Browsers are lax when it comes to response line
 * parsing; in most cases they only look for the word "http" at the
 * beginning. The same test is applied here: after skipping leading
 * whitespace and NUL bytes, the line must start with "http" in any
 * letter case.
 *
 * @param[in] data pointer to bytearray
 * @param[in] len length in bytes of data
 * @return 0 if the data starts with "http" (after the skipped bytes);
 *         1 if it does not, if fewer than four bytes remain, or if data
 *         is NULL - i.e. the line should be treated as body.
 */
int htp_treat_response_line_as_body(const uint8_t *data, size_t len) {
    // Browser behavior:
    //      Firefox 3.5.x: (?i)^\s*http
    //      IE: (?i)^\s*http\s*/
    //      Safari: ^HTTP/\d+\.\d+\s+\d{3}
    static const unsigned char upper[4] = { 'H', 'T', 'T', 'P' };
    static const unsigned char lower[4] = { 'h', 't', 't', 'p' };

    if (data == NULL) return 1;

    // Skip leading whitespace and NUL bytes.
    size_t start = 0;
    while (start < len) {
        if (!htp_is_space(data[start]) && (data[start] != 0)) break;
        start++;
    }

    // There must be room for the four letters.
    if (len < start + 4) return 1;

    for (size_t i = 0; i < 4; i++) {
        const uint8_t c = data[start + i];
        if ((c != upper[i]) && (c != lower[i])) return 1;
    }

    return 0;
}
2305
2306
/**
2307
 * Run the REQUEST_BODY_DATA hook.
2308
 *
2309
 * @param[in] connp
2310
 * @param[in] d
2311
 */
2312
103k
htp_status_t htp_req_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2313
    // Do not invoke callbacks with an empty data chunk
2314
103k
    if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2315
2316
    // Do not invoke callbacks without a transaction.
2317
103k
    if (connp->in_tx == NULL) return HTP_OK;
2318
2319
    // Run transaction hooks first
2320
103k
    htp_status_t rc = htp_hook_run_all(connp->in_tx->hook_request_body_data, d);
2321
103k
    if (rc != HTP_OK) return rc;
2322
2323
    // Run configuration hooks second
2324
103k
    rc = htp_hook_run_all(connp->cfg->hook_request_body_data, d);
2325
103k
    if (rc != HTP_OK) return rc;
2326
2327
    // On PUT requests, treat request body as file
2328
103k
    if (connp->put_file != NULL) {
2329
0
        htp_file_data_t file_data;
2330
2331
0
        file_data.data = d->data;
2332
0
        file_data.len = d->len;
2333
0
        file_data.file = connp->put_file;
2334
0
        file_data.file->len += d->len;
2335
2336
0
        rc = htp_hook_run_all(connp->cfg->hook_request_file_data, &file_data);
2337
0
        if (rc != HTP_OK) return rc;
2338
0
    }
2339
2340
103k
    return HTP_OK;
2341
103k
}
2342
2343
/**
2344
 * Run the RESPONSE_BODY_DATA hook.
2345
 *
2346
 * @param[in] connp
2347
 * @param[in] d
2348
 */
2349
2.47M
htp_status_t htp_res_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2350
    // Do not invoke callbacks with an empty data chunk.
2351
2.47M
    if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2352
2353
    // Run transaction hooks first
2354
1.93M
    htp_status_t rc = htp_hook_run_all(connp->out_tx->hook_response_body_data, d);
2355
1.93M
    if (rc != HTP_OK) return rc;
2356
2357
    // Run configuration hooks second
2358
1.93M
    rc = htp_hook_run_all(connp->cfg->hook_response_body_data, d);
2359
1.93M
    if (rc != HTP_OK) return rc;
2360
2361
1.93M
    return HTP_OK;
2362
1.93M
}
2363
2364
/**
2365
 * Parses the provided memory region, extracting the double-quoted string.
2366
 *
2367
 * @param[in] data
2368
 * @param[in] len
2369
 * @param[out] out
2370
 * @param[out] endoffset
2371
 * @return HTP_OK on success, HTP_DECLINED if the input is not well formed, and HTP_ERROR on fatal errors.
2372
 */
2373
0
htp_status_t htp_extract_quoted_string_as_bstr(unsigned char *data, size_t len, bstr **out, size_t *endoffset) {
2374
0
    if ((data == NULL) || (out == NULL)) return HTP_ERROR;
2375
2376
0
    if (len == 0) return HTP_DECLINED;
2377
2378
0
    size_t pos = 0;
2379
2380
    // Check that the first character is a double quote.
2381
0
    if (data[pos] != '"') return HTP_DECLINED;
2382
2383
    // Step over the double quote.
2384
0
    pos++;
2385
0
    if (pos == len) return HTP_DECLINED;
2386
2387
    // Calculate the length of the resulting string.
2388
0
    size_t escaped_chars = 0;
2389
0
    while (pos < len) {
2390
0
        if (data[pos] == '\\') {
2391
0
            if (pos + 1 < len) {
2392
0
                escaped_chars++;
2393
0
                pos += 2;
2394
0
                continue;
2395
0
            }
2396
0
        } else if (data[pos] == '"') {
2397
0
            break;
2398
0
        }
2399
2400
0
        pos++;
2401
0
    }
2402
2403
    // Have we reached the end of input without seeing the terminating double quote?
2404
0
    if (pos == len) return HTP_DECLINED;
2405
2406
    // Copy the data and unescape it as necessary.
2407
0
    size_t outlen = pos - 1 - escaped_chars;
2408
0
    *out = bstr_alloc(outlen);
2409
0
    if (*out == NULL) return HTP_ERROR;
2410
0
    unsigned char *outptr = bstr_ptr(*out);
2411
0
    size_t outpos = 0;
2412
2413
0
    pos = 1;
2414
0
    while ((pos < len) && (outpos < outlen)) {
2415
        // TODO We are not properly unescaping test here, we're only
2416
        //      handling escaped double quotes.
2417
0
        if (data[pos] == '\\') {
2418
0
            if (pos + 1 < len) {
2419
0
                outptr[outpos++] = data[pos + 1];
2420
0
                pos += 2;
2421
0
                continue;
2422
0
            }
2423
0
        } else if (data[pos] == '"') {
2424
0
            break;
2425
0
        }
2426
2427
0
        outptr[outpos++] = data[pos++];
2428
0
    }
2429
2430
0
    bstr_adjust_len(*out, outlen);
2431
2432
0
    if (endoffset != NULL) {
2433
0
        *endoffset = pos;
2434
0
    }
2435
2436
0
    return HTP_OK;
2437
0
}
2438
2439
433
/**
 * Extract the media type from a Content-Type header value: everything up
 * to (not including) the first ';', ',' or space, converted to lowercase.
 *
 * @param[in] header Header value.
 * @param[out] ct    Receives a newly allocated string with the media type.
 * @return HTP_OK on success, HTP_ERROR on NULL arguments or if the copy
 *         could not be made.
 */
htp_status_t htp_parse_ct_header(bstr *header, bstr **ct) {
    if ((header == NULL) || (ct == NULL)) return HTP_ERROR;

    const unsigned char *data = bstr_ptr(header);
    const size_t len = bstr_len(header);

    // The assumption here is that the header value we receive
    // here has been left-trimmed, which means the starting position
    // is on the media type. On some platforms that may not be the
    // case, and we may need to do the left-trim ourselves.

    // Find the end of the MIME type, using the same approach PHP 5.4.3 uses.
    size_t end;
    for (end = 0; end < len; end++) {
        const unsigned char c = data[end];
        if ((c == ';') || (c == ',') || (c == ' ')) break;
    }

    *ct = bstr_dup_ex(header, 0, end);
    if (*ct == NULL) return HTP_ERROR;

    bstr_to_lowercase(*ct);

    return HTP_OK;
}
2461
2462
/**
2463
 * Implements relaxed (not strictly RFC) hostname validation.
2464
 * 
2465
 * @param[in] hostname
2466
 * @return 1 if the supplied hostname is valid; 0 if it is not.
2467
 */
2468
27.6k
int htp_validate_hostname(bstr *hostname) {
2469
27.6k
    unsigned char *data = bstr_ptr(hostname);
2470
27.6k
    size_t len = bstr_len(hostname);
2471
27.6k
    size_t startpos = 0;
2472
27.6k
    size_t pos = 0;
2473
2474
27.6k
    if ((len == 0) || (len > 255)) return 0;
2475
2476
18.6k
    if (data[0] == '[') {
2477
        // only ipv6 possible
2478
3.69k
        if (len < 2 || len - 2 >= INET6_ADDRSTRLEN) {
2479
1.05k
            return 0;
2480
1.05k
        }
2481
2.63k
        char dst[sizeof(struct in6_addr)];
2482
2.63k
        char str[INET6_ADDRSTRLEN];
2483
2.63k
        memcpy(str, data+1, len-2);
2484
2.63k
        str[len-2] = 0;
2485
2.63k
        return inet_pton(AF_INET6, str, dst);
2486
3.69k
    }
2487
15.6k
    while (pos < len) {
2488
        // Validate label characters.
2489
15.3k
        startpos = pos;
2490
45.7k
        while ((pos < len) && (data[pos] != '.')) {
2491
39.9k
            unsigned char c = data[pos];
2492
            // According to the RFC, the underscore is not allowed in a label, but
2493
            // we allow it here because we think it's often seen in practice.
2494
39.9k
            if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
2495
39.9k
                        ((c >= '0') && (c <= '9')) ||
2496
39.9k
                         (c == '-') || (c == '_')))
2497
9.56k
            {
2498
9.56k
                return 0;
2499
9.56k
            }
2500
2501
30.3k
            pos++;
2502
30.3k
        }
2503
2504
        // Validate label length.
2505
5.82k
        if ((pos - startpos == 0) || (pos - startpos > 63)) return 0;
2506
2507
5.20k
        if (pos >= len) return 1; // No more data after label.
2508
2509
        // How many dots are there?
2510
1.82k
        startpos = pos;
2511
5.21k
        while ((pos < len) && (data[pos] == '.')) pos++;
2512
2513
1.82k
        if (pos - startpos != 1) return 0; // Exactly one dot expected.
2514
1.82k
    }
2515
2516
288
    return 1;
2517
14.9k
}
2518
2519
186k
/**
 * Releases a URI structure: every string component is passed to bstr_free()
 * and then the structure itself is freed. Does nothing when uri is NULL.
 *
 * @param[in] uri
 */
void htp_uri_free(htp_uri_t *uri) {
    if (uri != NULL) {
        // Components are released in the same order as they are listed here.
        bstr *components[] = {
            uri->scheme,
            uri->username,
            uri->password,
            uri->hostname,
            uri->port,
            uri->path,
            uri->query,
            uri->fragment
        };

        for (size_t i = 0; i < sizeof (components) / sizeof (components[0]); i++) {
            bstr_free(components[i]);
        }

        free(uri);
    }
}
2533
2534
185k
htp_uri_t *htp_uri_alloc(void) {
2535
185k
    htp_uri_t *u = calloc(1, sizeof (htp_uri_t));
2536
185k
    if (u == NULL) return NULL;
2537
2538
185k
    u->port_number = -1;
2539
2540
185k
    return u;
2541
185k
}
2542
2543
0
char *htp_get_version(void) {
2544
0
    return HTP_VERSION_STRING_FULL;
2545
0
}
2546
2547
/**
2548
 * Tells if a header value (haystack) contains a token (needle)
2549
 * This is done with a caseless comparison
2550
 *
2551
 * @param[in] hvp header value pointer
2552
 * @param[in] hvlen length of header value buffer
2553
 * @param[in] value token to look for (null-terminated string), should be a lowercase constant
2554
 * @return HTP_OK if the header has the token; HTP_ERROR if it has not.
2555
 */
2556
0
htp_status_t htp_header_has_token(const unsigned char *hvp, size_t hvlen, const unsigned char *value) {
2557
0
    int state = 0;
2558
    // offset to compare in value
2559
0
    size_t v_off = 0;
2560
    // The header value is a list of comma-separated tokens (with additional spaces)
2561
0
    for (size_t i = 0; i < hvlen; i++) {
2562
0
        switch (state) {
2563
0
            case 0:
2564
0
                if (v_off == 0 && htp_is_space(hvp[i])) {
2565
                    // skip leading space
2566
0
                    continue;
2567
0
                }
2568
0
                if (tolower(hvp[i]) == value[v_off]) {
2569
0
                    v_off++;
2570
0
                    if (value[v_off] == 0) {
2571
                        // finish validation if end of token
2572
0
                        state = 2;
2573
0
                    }
2574
0
                    continue;
2575
0
                } else {
2576
                    // wait for a new token
2577
0
                    v_off = 0;
2578
0
                    state = 1;
2579
0
                }
2580
                // fallthrough
2581
0
            case 1:
2582
0
                if (hvp[i] == ',') {
2583
                    // start of next token
2584
0
                    state = 0;
2585
0
                }
2586
0
                break;
2587
0
            case 2:
2588
0
                if (hvp[i] == ',') {
2589
0
                    return HTP_OK;
2590
0
                }
2591
0
                if (!htp_is_space(hvp[i])) {
2592
                    // trailing junk in token, wait for a next one
2593
0
                    v_off = 0;
2594
0
                    state = 1;
2595
0
                }
2596
0
        }
2597
0
    }
2598
0
    if (state == 2) {
2599
0
        return HTP_OK;
2600
0
    }
2601
0
    return HTP_ERROR;
2602
0
}