/src/lighttpd1.4/src/burl.c

Source
/*
 * burl - buffer URL normalization
 *
 * Copyright(c) 2018 Glenn Strauss gstrauss()gluelogic.com  All rights reserved
 * License: BSD 3-clause (same as lighttpd)
 */
#include "first.h"
#include "burl.h"

#include <string.h>

#include "buffer.h"
#include "base64.h"

/* uppercase hexadecimal digits, indexed by nibble value (0-15);
 * used in this file when emitting percent-encoded octets ("%XX") */
static const char hex_chars_uc[] = "0123456789ABCDEF";

/* lookup table indexed by octet value (0x00 - 0xFF)
 *   1: octet is not copied literally by the normalization routines
 *   0: octet may appear literally
 * entries which are 0 (everything else is 1):
 *   ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * note: '%' and '#' are marked 1; the routines using this table test for
 * those two chars explicitly before falling back to percent-encoding */
static const char encoded_chars_http_uri_reqd[] = {
  /*
  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
  */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  00 -  0F control chars */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  10 -  1F */
  1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  20 -  2F space " # % */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,  /*  30 -  3F < > */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  40 -  4F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,  /*  50 -  5F [ \ ] ^ */
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /*  60 -  6F ` */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,  /*  70 -  7F { | } DEL */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  80 -  8F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  90 -  9F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  A0 -  AF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  B0 -  BF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  C0 -  CF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  D0 -  DF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  E0 -  EF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /*  F0 -  FF */
};


/* li_cton(c,n): convert hex digit char c to its value (0-15), stored into n
 * - evaluates to non-zero if c is a hex digit (0-9, A-F, a-f), else to 0
 * - n is assigned as a side effect even when the result is 0
 * - c appears twice in the expansion, so it must not have side effects
 * - (c)&0xdf clears bit 0x20, folding 'a'-'f' onto 'A'-'F'
 * c (char) and n (nibble) MUST be unsigned integer types
 * (the "<= 9" and "<= 5" range tests rely on n wrapping to a large unsigned
 *  value when c is below '0' or below 'A') */
#define li_cton(c,n) \
  (((n) = (c) - '0') <= 9 || (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))

/* b (byte) MUST be unsigned integer type
 * https://en.wikipedia.org/wiki/UTF-8
 * detect invalid UTF-8 byte and byte in overlong encoding of 7-bit ASCII
 * (but does not detect other invalid/overlong multibyte encoding sequences)
 * (thin alias; light_utf8_invalid_byte() is defined outside this file) */
#define li_utf8_invalid_byte(b) light_utf8_invalid_byte(b)


/* returns 1 if c is alphanumeric (per light_isalnum()) or one of - . _ ~
 * (the "unreserved" set referenced throughout this file), else returns 0 */
static int burl_is_unreserved (const int c)
{
    if (light_isalnum(c)) return 1;

    switch (c) {
      case '-':
      case '.':
      case '_':
      case '~':
        return 1;
      default:
        return 0;
    }
}


/* slow path of burl_normalize_basic_unreserved()
 * rewrites b starting at offset i, using t as scratch space, and then copies
 * the result back into b:
 *  - b[0..i) is copied unchanged
 *  - chars allowed literally by encoded_chars_http_uri_reqd[] are copied
 *  - "%XX" which encodes an unreserved char is decoded to that char;
 *    any other "%XX" is kept, with its hex digits emitted in uppercase
 *  - '#' stops the rewrite (fragment is discarded)
 *  - every other char is percent-encoded
 * qs is the scan state from the caller: -1 no '?' seen, >= 0 offset of
 * first '?', -2 invalid UTF-8 byte seen.  Returns the updated state;
 * once qs is -2 it stays -2. */
static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs)
{
    const int end = (int)buffer_clen(b);
    const unsigned char * const src = (unsigned char *)b->ptr;
    /* worst case: each remaining input char expands to 3 output chars */
    unsigned char * const dst =
      (unsigned char *)buffer_string_prepare_copy(t,i+(end-i)*3+1);
    int out = i;

    memcpy(dst, src, (size_t)i);

    while (i < end) {
        const unsigned char c = src[i];
        unsigned int hi, lo;

        if (!encoded_chars_http_uri_reqd[c]) {
            /* literal char; remember offset of first '?' in the output */
            if (__builtin_expect( (c == '?'), 0) && -1 == qs) qs = out;
            dst[out++] = c;
            ++i;
            continue;
        }

        if (c == '%' && li_cton(src[i+1], hi) && li_cton(src[i+2], lo)) {
            const unsigned int octet = (hi << 4) | lo;
            if (burl_is_unreserved(octet)) {
                dst[out++] = octet;
            }
            else {
                dst[out++] = '%';
                dst[out++] = hex_chars_uc[hi];
                dst[out++] = hex_chars_uc[lo];
                if (li_utf8_invalid_byte(octet)) qs = -2;
            }
            i += 3; /* consumed "%XX" */
            continue;
        }

        if (c == '#') break; /* ignore fragment */

        /* char which must not appear literally: percent-encode it */
        dst[out++] = '%';
        dst[out++] = hex_chars_uc[(c >> 4) & 0xF];
        dst[out++] = hex_chars_uc[c & 0xF];
        if (li_utf8_invalid_byte(c)) qs = -2;
        ++i;
    }

    buffer_copy_string_len(b, (char *)dst, (size_t)out);
    return qs;
}


/* basic normalization, "unreserved" flavor
 * fast path: scans b in place for as long as the only change needed is
 * uppercasing the hex digits of "%XX" sequences which do not encode an
 * unreserved char.  A '#' truncates b at that offset (fragment dropped).
 * The first char needing any other rewrite hands the remainder of the work
 * to burl_normalize_basic_unreserved_fix() (t is its scratch buffer).
 * Returns offset of first '?' in b, -1 if there is none, or -2 if
 * li_utf8_invalid_byte() flagged an octet. */
static int burl_normalize_basic_unreserved (buffer *b, buffer *t)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int end = (int)buffer_clen(b);
    int qs = -1;
    int i = 0;

    while (i < end) {
        const unsigned char c = s[i];
        unsigned int hi, lo, octet;

        if (!encoded_chars_http_uri_reqd[c]) {
            if (__builtin_expect( (c == '?'), 0) && -1 == qs) qs = i;
            ++i;
            continue;
        }

        if (c == '%' && li_cton(s[i+1], hi) && li_cton(s[i+2], lo)
            && !burl_is_unreserved((octet = (hi << 4) | lo))) {
            /* "%XX" stays encoded; only the case of XX may need fixing */
            if (li_utf8_invalid_byte(octet)) qs = -2;
            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
            i += 3;
            continue;
        }

        if (c == '#') { /* ignore fragment */
            buffer_truncate(b, (size_t)i);
            return qs;
        }

        /* anything else changes the string length: take the slow path */
        return burl_normalize_basic_unreserved_fix(b, t, i, qs);
    }

    return qs;
}


/* slow path of burl_normalize_basic_required()
 * rewrites b starting at offset i, using t as scratch space, and then copies
 * the result back into b:
 *  - b[0..i) is copied unchanged
 *  - chars allowed literally by encoded_chars_http_uri_reqd[] are copied
 *  - "%XX" is decoded when the encoded char is allowed literally and is not
 *    a delimiter in the current component: '/' and '?' stay encoded before
 *    the query-string; '&' '=' ';' '+' stay encoded within the query-string.
 *    "%XX" left encoded has its hex digits emitted in uppercase
 *  - '#' stops the rewrite (fragment is discarded)
 *  - every other char is percent-encoded
 * qs is the offset of the first '?' seen so far by the caller, or < 0 if
 * none has been seen.
 * Returns the offset of the first '?' in the rewritten b, or the incoming
 * negative qs if there is none; returns -2 if li_utf8_invalid_byte() flagged
 * an octet which remains percent-encoded. */
static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs)
{
    int j = i;
    const int used = (int)buffer_clen(b);
    const unsigned char * const s = (unsigned char *)b->ptr;
    /* worst case: each remaining input char expands to 3 output chars */
    unsigned char * const p =
      (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1);
    unsigned int n1, n2;
    int invalid_utf8 = 0;
    memcpy(p, s, (size_t)i);
    for (; i < used; ++i, ++j) {
        if (!encoded_chars_http_uri_reqd[s[i]]) {
            p[j] = s[i];
            /* query-string begins at the FIRST '?'; any later '?' is query
             * data and must not move the path/query boundary (consistent
             * with burl_normalize_basic_unreserved() and burl_scan_qmark())*/
            if (__builtin_expect( (s[i] == '?'), 0) && qs < 0) qs = j;
        }
        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (!encoded_chars_http_uri_reqd[x]
                && (qs < 0
                    ? (x != '/' && x != '?')
                    : (x != '&' && x != '=' && x != ';' && x != '+'))) {
                p[j] = x;
            }
            else {
                p[j]   = '%';
                p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/
                p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/
                invalid_utf8 |= li_utf8_invalid_byte(x);
            }
            i+=2;
        }
        else if (s[i] == '#') break; /* ignore fragment */
        else {
            p[j]   = '%';
            p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
            p[++j] = hex_chars_uc[s[i] & 0xF];
            invalid_utf8 |= li_utf8_invalid_byte(s[i]);
        }
    }
    buffer_copy_string_len(b, (char *)p, (size_t)j);
    return !invalid_utf8 ? qs : -2;
}


/* basic normalization, "required" flavor
 * fast path: scans b in place for as long as the only change needed is
 * uppercasing the hex digits of "%XX" sequences which must remain encoded
 * (see burl_normalize_basic_required_fix() for which those are).
 * A '#' truncates b at that offset (fragment dropped).  The first char
 * needing any other rewrite hands the remainder of the work to
 * burl_normalize_basic_required_fix() (t is its scratch buffer).
 * Returns offset of the first '?' in b, -1 if there is none, or -2 if
 * li_utf8_invalid_byte() flagged an octet. */
static int burl_normalize_basic_required (buffer *b, buffer *t)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int used = (int)buffer_clen(b);
    unsigned int n1, n2, x;
    int qs = -1;
    int invalid_utf8 = 0;

    for (int i = 0; i < used; ++i) {
        if (!encoded_chars_http_uri_reqd[s[i]]) {
            /* record only the FIRST '?'; later '?' belong to the query */
            if (s[i] == '?' && qs < 0) qs = i;
        }
        else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
                 && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)]
                     || (qs < 0
                         ? (x == '/' || x == '?')
                         : (x == '&' || x == '=' || x == ';' || x == '+')))) {
            invalid_utf8 |= li_utf8_invalid_byte(x);
            if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
            if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
            i+=2;
        }
        else if (s[i] == '#') { /* ignore fragment */
            buffer_truncate(b, (size_t)i);
            break;
        }
        else {
            /* string length will change: take the slow path */
            qs = burl_normalize_basic_required_fix(b, t, i, qs);
            break;
        }
    }

    return !invalid_utf8 ? qs : -2;
}


/* returns 1 if b contains '%' followed by a char below '2' (e.g. "%0X",
 * "%1X") or contains "%7F"; otherwise returns 0
 * (expects hex digits to have been uppercased already: only 'F' is matched)*/
static int burl_contains_ctrls (const buffer *b)
{
    const char * const s = b->ptr;
    const int n = (int)buffer_clen(b);
    int i;

    for (i = 0; i < n; ++i) {
        if (s[i] != '%') continue;
        if (s[i+1] < '2') return 1;
        if (s[i+1] == '7' && s[i+2] == 'F') return 1;
    }

    return 0;
}


/* compacts b in place from offset i onward, replacing each "%20" with '+',
 * then truncates b to the new (shorter or equal) length */
static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
{
    char * const s = b->ptr;
    const int end = (int)buffer_clen(b);
    int rd = i; /* read offset  */
    int wr = i; /* write offset (wr <= rd always holds) */

    while (rd < end) {
        if (s[rd] == '%' && s[rd+1] == '2' && s[rd+2] == '0') {
            s[wr++] = '+';
            rd += 3;
        }
        else {
            s[wr++] = s[rd++];
        }
    }

    buffer_truncate(b, wr);
}


/* replaces "%20" with '+' in b after offset qs (offset of '?')
 * does nothing if qs < 0; b is modified only if a "%20" is found */
static void burl_normalize_qs20_to_plus (buffer *b, int qs)
{
    if (qs < 0) return;

    const char * const s = b->ptr;
    const int used = (int)buffer_clen(b);
    int i = qs + 1;

    /* locate first "%20" (if any) following the '?' */
    while (i < used && !(s[i] == '%' && s[i+1] == '2' && s[i+2] == '0'))
        ++i;

    if (i != used) burl_normalize_qs20_to_plus_fix(b, i);
}


/* compacts the path portion of b in place from offset i onward, replacing
 * each "%2F" with '/'.  The path portion ends at offset qs, or at end of b
 * if qs < 0.  If qs >= 0, the remainder of b starting at qs is moved down
 * to follow the compacted path.  b is truncated to the new length.
 * Returns the new offset of what was at qs, or qs unchanged if qs < 0. */
static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
{
    char * const s = b->ptr;
    const int blen = (int)buffer_clen(b);
    const int path_end = (qs >= 0) ? qs : blen;
    int wr = i;

    for (int rd = i; rd < path_end; ) {
        if (s[rd] == '%' && s[rd+1] == '2' && s[rd+2] == 'F') {
            s[wr++] = '/';
            rd += 3;
        }
        else {
            s[wr++] = s[rd++];
        }
    }

    if (qs < 0) {
        buffer_truncate(b, wr);
        return qs;
    }

    /* regions may overlap, hence memmove() */
    const int qslen = blen - qs;
    memmove(s+wr, s+qs, (size_t)qslen);
    buffer_truncate(b, wr + qslen);
    return wr;
}


/* looks for "%2F" in the path portion of b (before offset qs, or all of b
 * if qs < 0).  If none is found, returns qs unchanged.  If found, either
 * decodes every "%2F" in the path to '/' and returns the adjusted qs
 * (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE set in flags), or returns -2.
 * ("%2F" must already have been uppercased during normalization) */
static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
{
    const char * const s = b->ptr;
    const int path_end = (qs < 0) ? (int)buffer_clen(b) : qs;
    int i = 0;

    while (i < path_end && !(s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F'))
        ++i;

    if (i >= path_end) return qs; /* no "%2F" in path */

    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
        return burl_normalize_2F_to_slash_fix(b, qs, i);

    return -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/
}


/* checks the path portion of b (before offset qs, or all of b if qs < 0)
 * for "." or ".." path segments and for "//"; if any is present, either
 * rejects (returns -2 when HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT is
 * set in flags) or passes the path through buffer_path_simplify().
 * t is scratch space used to hold the query-string while the path is
 * simplified.
 * Returns the (possibly changed) offset of the query-string, qs unchanged if
 * qs < 0 or if nothing needed simplifying, or -2 on reject. */
static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags)
{
    const unsigned char * const s = (unsigned char *)b->ptr;
    const int used = (int)buffer_clen(b);
    int path_simplify = 0;
    /* each iteration begins at the start of a path segment
     * (the loop's ++i steps over the '/' located at the end of the body) */
    for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) {
        /* segment is "." or ".." if, after the dot(s), the next char ends
         * the segment; the ++i in the condition steps over the first '.'
         * of ".." and is always non-zero there since i >= 0 beforehand */
        if (s[i] == '.' && (s[i+1] != '.' || ++i)
            && (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0')) {
            path_simplify = 1;
            break;
        }
        /* advance to the '/' ending this segment (or to end of path) */
        while (i < len && s[i] != '/') ++i;
        if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/
            path_simplify = 1;
            break;
        }
    }

    if (path_simplify) {
        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;
        if (qs >= 0) {
            /* set query-string aside in t so that only the path is passed
             * to buffer_path_simplify() */
            buffer_copy_string_len(t, b->ptr+qs, used - qs);
            buffer_truncate(b, qs);
        }

        /* NOTE(review): presumably resolves "." / ".." segments and
         * collapses "//"; defined outside this file -- confirm there */
        buffer_path_simplify(b);

        if (qs >= 0) {
            /* query-string now begins at the end of the simplified path */
            qs = (int)buffer_clen(b);
            buffer_append_string_len(b, BUF_PTR_LEN(t));
        }
    }

    return qs;
}


/* returns offset of the first '?' in b, or -1 if b contains no '?' */
__attribute_cold__
__attribute_noinline__
__attribute_pure__
static int burl_scan_qmark (const buffer * const b) {
    const char * const start = b->ptr;
    const char * const hit = strchr(start, '?');
    if (NULL == hit) return -1;
    return (int)(hit - start);
}


/* normalizes URL in b in place according to HTTP_PARSEOPT_URL_NORMALIZE_*
 * bits set in flags; t is a scratch buffer.  Steps, in order:
 *  1. (Windows/Cygwin only, optional) translate '\\' to '/' in the path
 *  2. percent-encoding normalization ("required" or "unreserved" flavor)
 *  3. (optional) reject encoded control chars
 *  4. (optional) decode or reject "%2F" in the path
 *  5. (optional) simplify or reject "." / ".." / "//" in the path
 *  6. (optional) replace "%20" with '+' in the query-string
 * Returns the offset of '?' in b (as tracked through the steps above),
 * -1 if b has no query-string, or -2 if the URL was rejected. */
int burl_normalize (buffer *b, buffer *t, int flags)
{
  #if defined(_WIN32) || defined(__CYGWIN__)
    /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
     * convert to '/' for consistency before percent-encoding
     * normalization which will convert '\\' to "%5C" in the URL.
     * (Clients still should not be sending '\\' unencoded in requests.) */
    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
        char *p = b->ptr;
        while (*p != '?' && *p != '\0') {
            if (*p == '\\') *p = '/';
            ++p;
        }
    }
  #endif

    int qs;
    if (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
        qs = burl_normalize_basic_required(b, t);
    else
        qs = burl_normalize_basic_unreserved(b, t);

    if (-2 == qs) {
        /* invalid UTF-8 byte was flagged; unless rejecting, the location
         * of '?' has to be recomputed */
        if (flags & HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT) return -2;
        qs = burl_scan_qmark(b);
    }

    if ((flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT)
        && burl_contains_ctrls(b))
        return -2;

    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
        qs = burl_normalize_2F_to_slash(b, qs, flags);
        if (-2 == qs) return -2;
    }

    if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
                |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
        qs = burl_normalize_path(b, t, qs, flags);
        if (-2 == qs) return -2;
    }

    if ((flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) && qs >= 0)
        burl_normalize_qs20_to_plus(b, qs);

    return qs;
}


/* appends str (len bytes) to b, percent-encoding everything except
 * unreserved  - . 0-9 A-Z _ a-z ~
 * unless already percent-encoded (does not double-encode):
 *  - "%XX" which encodes an unreserved char is decoded to that char
 *  - any other "%XX" is copied through as-is (hex digit case is preserved)
 * A '%' is treated as the start of "%XX" only if both hex digits lie within
 * the len bytes of str; str need not be NUL-terminated and bytes at or
 * beyond str+len are never read.
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: each input char expands to 3 output chars */
    char * const p = buffer_string_prepare_append(b, len*3);
    unsigned int n1, n2;
    size_t j = 0;
    for (size_t i = 0; i < len; ++i, ++j) {
        /* li_cton() and the nibble extraction below need an unsigned type */
        const unsigned char c = (unsigned char)str[i];
        if (c == '%' && len - i > 2
            && li_cton((unsigned char)str[i+1], n1)
            && li_cton((unsigned char)str[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (burl_is_unreserved((int)x)) {
                p[j] = (char)x;
            }
            else { /* leave UTF-8, control chars, and required chars encoded */
                p[j]   = '%';
                p[++j] = str[i+1];
                p[++j] = str[i+2];
            }
            i+=2;
        }
        else if (burl_is_unreserved(c)) {
            p[j] = (char)c;
        }
        else {
            p[j]   = '%';
            p[++j] = hex_chars_uc[(c >> 4) & 0xF];
            p[++j] = hex_chars_uc[c & 0xF];
        }
    }
    buffer_commit(b, j);
}


/* appends str (len bytes) to b, percent-encoding everything except
 * unreserved  - . 0-9 A-Z _ a-z ~  plus  /
 * unless already percent-encoded (does not double-encode):
 *  - "%XX" which encodes an unreserved char is decoded to that char
 *  - any other "%XX" (including "%2F") is copied through as-is
 *    (hex digit case is preserved)
 * A '%' is treated as the start of "%XX" only if both hex digits lie within
 * the len bytes of str; str need not be NUL-terminated and bytes at or
 * beyond str+len are never read.
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: each input char expands to 3 output chars */
    char * const p = buffer_string_prepare_append(b, len*3);
    unsigned int n1, n2;
    size_t j = 0;
    for (size_t i = 0; i < len; ++i, ++j) {
        /* li_cton() and the nibble extraction below need an unsigned type */
        const unsigned char c = (unsigned char)str[i];
        if (c == '%' && len - i > 2
            && li_cton((unsigned char)str[i+1], n1)
            && li_cton((unsigned char)str[i+2], n2)) {
            const unsigned int x = (n1 << 4) | n2;
            if (burl_is_unreserved((int)x)) {
                p[j] = (char)x;
            }
            else { /* leave UTF-8, control chars, and required chars encoded */
                p[j]   = '%';
                p[++j] = str[i+1];
                p[++j] = str[i+2];
            }
            i+=2;
        }
        else if (burl_is_unreserved(c) || c == '/') {
            p[j] = (char)c;
        }
        else {
            p[j]   = '%';
            p[++j] = hex_chars_uc[(c >> 4) & 0xF];
            p[++j] = hex_chars_uc[c & 0xF];
        }
    }
    buffer_commit(b, j);
}


/* appends str (len bytes) to b, percent-encoding every char which is not
 * unreserved  - . 0-9 A-Z _ a-z ~
 * Note: '%' is not unreserved, so any existing "%XX" gets double-encoded
 * Note: not checking for invalid UTF-8 */
static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len)
{
    /* worst case: each input char expands to 3 output chars */
    char * const p = buffer_string_prepare_append(b, len*3);
    int j = 0;
    for (unsigned int i = 0; i < len; ++i) {
        const char c = str[i];
        if (burl_is_unreserved(c)) {
            p[j++] = c;
            continue;
        }
        p[j++] = '%';
        p[j++] = hex_chars_uc[(c >> 4) & 0xF];
        p[j++] = hex_chars_uc[c & 0xF];
    }
    buffer_commit(b, j);
}


/* lowercases uppercase letters in b from offset off to the terminating
 * '\0', in place
 * (skips over all percent-encodings, including encoding of alpha chars,
 *  so the hex digits of "%XX" are left untouched) */
static void burl_offset_tolower (buffer * const b, const size_t off)
{
    char *p = b->ptr+off;
    while (*p) {
        if (light_isupper(*p))
            *p |= 0x20;
        else if (*p=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
            p += 2; /* step over the two hex digits */
        ++p;
    }
}


/* uppercases lowercase letters in b from offset off to the terminating
 * '\0', in place
 * (skips over all percent-encodings, including encoding of alpha chars,
 *  so the hex digits of "%XX" are left untouched) */
static void burl_offset_toupper (buffer * const b, const size_t off)
{
    char *p = b->ptr+off;
    while (*p) {
        if (light_islower(*p))
            *p &= 0xdf;
        else if (*p=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
            p += 2; /* step over the two hex digits */
        ++p;
    }
}


/* appends str (len bytes) to b, transformed as selected by BURL_* flags
 *  - len == 0: nothing is appended
 *  - flags == 0: str is appended verbatim
 *  - otherwise the first matching flag, tested in this order, selects the
 *    transform: BURL_ENCODE_NONE, BURL_ENCODE_ALL, BURL_ENCODE_NDE,
 *    BURL_ENCODE_PSNDE, BURL_ENCODE_B64U, BURL_DECODE_B64U
 *    (if none of these is set, nothing is appended)
 *  - BURL_TOLOWER / BURL_TOUPPER then change the case of the appended part
 *    (BURL_TOLOWER wins if both are set) */
void burl_append (buffer * const b, const char * const str, const size_t len, const int flags)
{
    if (0 == len) return;

    if (0 == flags) {
        buffer_append_string_len(b, str, len);
        return;
    }

    /* remember where the appended data starts if case is to be changed */
    const int recase = flags & (BURL_TOUPPER|BURL_TOLOWER);
    const size_t off = recase ? buffer_clen(b) : 0;

    if (flags & BURL_ENCODE_NONE)
        buffer_append_string_len(b, str, len);
    else if (flags & BURL_ENCODE_ALL)
        burl_append_encode_all(b, str, len);
    else if (flags & BURL_ENCODE_NDE)
        burl_append_encode_nde(b, str, len);
    else if (flags & BURL_ENCODE_PSNDE)
        burl_append_encode_psnde(b, str, len);
    else if (flags & BURL_ENCODE_B64U) {
        const unsigned char *s = (const unsigned char *)str;
        buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL);
    }
    else if (flags & BURL_DECODE_B64U)
        buffer_append_base64_decode(b, str, len, BASE64_URL);

    if (!recase) return;

    /* note: not normalizing str, which could come from arbitrary header,
     * so it is possible that alpha chars are percent-encoded upper/lowercase */
    if (flags & BURL_TOLOWER)
        burl_offset_tolower(b, off);
    else /*(flags & BURL_TOUPPER)*/
        burl_offset_toupper(b, off);
}

Coverage Report

Created: 2025-10-13 06:52

Line	Count	Source
1		/*
2		* burl - buffer URL normalization
3		*
4		* Copyright(c) 2018 Glenn Strauss gstrauss()gluelogic.com All rights reserved
5		* License: BSD 3-clause (same as lighttpd)
6		*/
7		#include "first.h"
8		#include "burl.h"
9
10		#include <string.h>
11
12		#include "buffer.h"
13		#include "base64.h"
14
15		static const char hex_chars_uc[] = "0123456789ABCDEF";
16
17		/* everything except: ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~ */
18		static const char encoded_chars_http_uri_reqd[] = {
19		/*
20		0 1 2 3 4 5 6 7 8 9 A B C D E F
21		*/
22		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00 - 0F control chars */
23		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10 - 1F */
24		1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 20 - 2F space " # % */
25		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 30 - 3F < > */
26		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40 - 4F */
27		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 50 - 5F [ \ ] ^ */
28		1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60 - 6F ` */
29		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, /* 70 - 7F { \| } DEL */
30		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 80 - 8F */
31		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 90 - 9F */
32		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* A0 - AF */
33		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* B0 - BF */
34		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* C0 - CF */
35		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* D0 - DF */
36		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* E0 - EF */
37		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* F0 - FF */
38		};
39
40
41		/* c (char) and n (nibble) MUST be unsigned integer types */
42		#define li_cton(c,n) \
43	42.9M	(((n) = (c) - '0') <= 9 \|\| (((n) = ((c)&0xdf) - 'A') <= 5 ? ((n) += 10) : 0))
44
45		/* b (byte) MUST be unsigned integer type
46		* https://en.wikipedia.org/wiki/UTF-8
47		* detect invalid UTF-8 byte and byte in overlong encoding of 7-bit ASCII
48		* (but does not detect other invalid/overlong multibyte encoding sequences) */
49	42.7M	#define li_utf8_invalid_byte(b) light_utf8_invalid_byte(b)
50
51
52		static int burl_is_unreserved (const int c)
53	9.63k	{
54	9.63k	return (light_isalnum(c) \|\| c == '-' \|\| c == '.' \|\| c == '_' \|\| c == '~');
55	9.63k	}
56
57
58		static int burl_normalize_basic_unreserved_fix (buffer b, buffer t, int i, int qs)
59	590	{
60	590	int j = i;
61	590	const int used = (int)buffer_clen(b);
62	590	const unsigned char * const s = (unsigned char *)b->ptr;
63	590	unsigned char * const p =
64	590	(unsigned char )buffer_string_prepare_copy(t,i+(used-i)3+1);
65	590	unsigned int n1, n2;
66	590	memcpy(p, s, (size_t)i);
67	22.2M	for (; i < used; ++i, ++j) {
68	22.2M	if (!encoded_chars_http_uri_reqd[s[i]]) {
69	1.82M	p[j] = s[i];
70	1.82M	if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = j;
71	1.82M	}
72	20.4M	else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
73	7.32k	const unsigned int x = (n1 << 4) \| n2;
74	7.32k	if (burl_is_unreserved(x)) {
75	2.33k	p[j] = x;
76	2.33k	}
77	4.99k	else {
78	4.99k	p[j] = '%';
79	4.99k	p[++j] = hex_chars_uc[n1]; /(s[i+1] & 0xdf)/
80	4.99k	p[++j] = hex_chars_uc[n2]; /(s[i+2] & 0xdf)/
81	4.99k	if (li_utf8_invalid_byte(x)) qs = -2;
82	4.99k	}
83	7.32k	i+=2;
84	7.32k	}
85	20.4M	else if (s[i] == '#') break; /* ignore fragment */
86	20.4M	else {
87	20.4M	p[j] = '%';
88	20.4M	p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
89	20.4M	p[++j] = hex_chars_uc[s[i] & 0xF];
90	20.4M	if (li_utf8_invalid_byte(s[i])) qs = -2;
91	20.4M	}
92	22.2M	}
93	590	buffer_copy_string_len(b, (char *)p, (size_t)j);
94	590	return qs;
95	590	}
96
97
98		static int burl_normalize_basic_unreserved (buffer b, buffer t)
99	755	{
100	755	const unsigned char * const s = (unsigned char *)b->ptr;
101	755	const int used = (int)buffer_clen(b);
102	755	unsigned int n1, n2, x;
103	755	int qs = -1;
104
105	742k	for (int i = 0; i < used; ++i) {
106	742k	if (!encoded_chars_http_uri_reqd[s[i]]) {
107	739k	if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = i;
108	739k	}
109	2.84k	else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
110	2.30k	&& !burl_is_unreserved((x = (n1 << 4) \| n2))) {
111	2.25k	if (li_utf8_invalid_byte(x)) qs = -2;
112	2.25k	if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
113	2.25k	if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
114	2.25k	i+=2;
115	2.25k	}
116	591	else if (s[i] == '#') { /* ignore fragment */
117	1	buffer_truncate(b, (size_t)i);
118	1	break;
119	1	}
120	590	else {
121	590	qs = burl_normalize_basic_unreserved_fix(b, t, i, qs);
122	590	break;
123	590	}
124	742k	}
125
126	755	return qs;
127	755	}
128
129
130		static int burl_normalize_basic_required_fix (buffer b, buffer t, int i, int qs)
131	678	{
132	678	int j = i;
133	678	const int used = (int)buffer_clen(b);
134	678	const unsigned char * const s = (unsigned char *)b->ptr;
135	678	unsigned char * const p =
136	678	(unsigned char )buffer_string_prepare_copy(t,i+(used-i)3+1);
137	678	unsigned int n1, n2;
138	678	int invalid_utf8 = 0;
139	678	memcpy(p, s, (size_t)i);
140	23.9M	for (; i < used; ++i, ++j) {
141	23.9M	if (!encoded_chars_http_uri_reqd[s[i]]) {
142	1.63M	p[j] = s[i];
143	1.63M	if (__builtin_expect( (s[i] == '?'), 0)) qs = j;
144	1.63M	}
145	22.3M	else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) {
146	24.4k	const unsigned int x = (n1 << 4) \| n2;
147	24.4k	if (!encoded_chars_http_uri_reqd[x]
148	15.8k	&& (qs < 0
149	15.8k	? (x != '/' && x != '?')
150	15.8k	: (x != '&' && x != '=' && x != ';' && x != '+'))) {
151	9.09k	p[j] = x;
152	9.09k	}
153	15.3k	else {
154	15.3k	p[j] = '%';
155	15.3k	p[++j] = hex_chars_uc[n1]; /(s[i+1] & 0xdf)/
156	15.3k	p[++j] = hex_chars_uc[n2]; /(s[i+2] & 0xdf)/
157	15.3k	invalid_utf8 \|= li_utf8_invalid_byte(x);
158	15.3k	}
159	24.4k	i+=2;
160	24.4k	}
161	22.3M	else if (s[i] == '#') break; /* ignore fragment */
162	22.3M	else {
163	22.3M	p[j] = '%';
164	22.3M	p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF];
165	22.3M	p[++j] = hex_chars_uc[s[i] & 0xF];
166	22.3M	invalid_utf8 \|= li_utf8_invalid_byte(s[i]);
167	22.3M	}
168	23.9M	}
169	678	buffer_copy_string_len(b, (char *)p, (size_t)j);
170	678	return !invalid_utf8 ? qs : -2;
171	678	}
172
173
174		static int burl_normalize_basic_required (buffer b, buffer t)
175	884	{
176	884	const unsigned char * const s = (unsigned char *)b->ptr;
177	884	const int used = (int)buffer_clen(b);
178	884	unsigned int n1, n2, x;
179	884	int qs = -1;
180	884	int invalid_utf8 = 0;
181
182	733k	for (int i = 0; i < used; ++i) {
183	732k	if (!encoded_chars_http_uri_reqd[s[i]]) {
184	723k	if (s[i] == '?') qs = i;
185	723k	}
186	9.79k	else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)
187	9.18k	&& (encoded_chars_http_uri_reqd[(x = (n1 << 4) \| n2)]
188	4.02k	\|\| (qs < 0
189	4.02k	? (x == '/' \|\| x == '?')
190	9.11k	: (x == '&' \|\| x == '=' \|\| x == ';' \|\| x == '+')))) {
191	9.11k	invalid_utf8 \|= li_utf8_invalid_byte(x);
192	9.11k	if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */
193	9.11k	if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */
194	9.11k	i+=2;
195	9.11k	}
196	680	else if (s[i] == '#') { /* ignore fragment */
197	2	buffer_truncate(b, (size_t)i);
198	2	break;
199	2	}
200	678	else {
201	678	qs = burl_normalize_basic_required_fix(b, t, i, qs);
202	678	break;
203	678	}
204	732k	}
205
206	884	return !invalid_utf8 ? qs : -2;
207	884	}
208
209
210		static int burl_contains_ctrls (const buffer *b)
211	345	{
212	345	const char * const s = b->ptr;
213	345	const int used = (int)buffer_clen(b);
214	12.3M	for (int i = 0; i < used; ++i) {
215	12.3M	if (s[i] == '%' && (s[i+1] < '2' \|\| (s[i+1] == '7' && s[i+2] == 'F')))
216	83	return 1;
217	12.3M	}
218	262	return 0;
219	345	}
220
221
222		static void burl_normalize_qs20_to_plus_fix (buffer *b, int i)
223	101	{
224	101	char * const s = b->ptr;
225	101	const int used = (int)buffer_clen(b);
226	101	int j = i;
227	5.65M	for (; i < used; ++i, ++j) {
228	5.65M	s[j] = s[i];
229	5.65M	if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') {
230	1.06k	s[j] = '+';
231	1.06k	i+=2;
232	1.06k	}
233	5.65M	}
234	101	buffer_truncate(b, j);
235	101	}
236
237
238		static void burl_normalize_qs20_to_plus (buffer *b, int qs)
239	408	{
240	408	const char * const s = b->ptr;
241	408	const int used = qs < 0 ? 0 : (int)buffer_clen(b);
242	408	int i;
243	408	if (qs < 0) return;
244	11.1M	for (i = qs+1; i < used; ++i) {
245	11.1M	if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') break;
246	11.1M	}
247	408	if (i != used) burl_normalize_qs20_to_plus_fix(b, i);
248	408	}
249
250
251		static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i)
252	196	{
253	196	char * const s = b->ptr;
254	196	const int blen = (int)buffer_clen(b);
255	196	const int used = qs < 0 ? blen : qs;
256	196	int j = i;
257	23.7M	for (; i < used; ++i, ++j) {
258	23.7M	s[j] = s[i];
259	23.7M	if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
260	2.38k	s[j] = '/';
261	2.38k	i+=2;
262	2.38k	}
263	23.7M	}
264	196	if (qs >= 0) {
265	69	const int qslen = blen - qs;
266	69	memmove(s+j, s+qs, (size_t)qslen);
267	69	qs = j;
268	69	j += qslen;
269	69	}
270	196	buffer_truncate(b, j);
271	196	return qs;
272	196	}
273
274
275		static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags)
276	1.22k	{
277		/("%2F" must already have been uppercased during normalization)/
278	1.22k	const char * const s = b->ptr;
279	1.22k	const int used = qs < 0 ? (int)buffer_clen(b) : qs;
280	50.0M	for (int i = 0; i < used; ++i) {
281	50.0M	if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') {
282	201	return (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE)
283	201	? burl_normalize_2F_to_slash_fix(b, qs, i)
284	201	: -2; /(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)/
285	201	}
286	50.0M	}
287	1.02k	return qs;
288	1.22k	}
289
290
291		static int burl_normalize_path (buffer b, buffer t, int qs, int flags)
292	1.16k	{
293	1.16k	const unsigned char * const s = (unsigned char *)b->ptr;
294	1.16k	const int used = (int)buffer_clen(b);
295	1.16k	int path_simplify = 0;
296	20.5k	for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) {
297	19.7k	if (s[i] == '.' && (s[i+1] != '.' \|\| ++i)
298	12.4k	&& (s[i+1] == '/' \|\| s[i+1] == '?' \|\| s[i+1] == '\0')) {
299	233	path_simplify = 1;
300	233	break;
301	233	}
302	46.7M	while (i < len && s[i] != '/') ++i;
303	19.5k	if (s[i] == '/' && s[i+1] == '/') { /(s[len] != '/')/
304	178	path_simplify = 1;
305	178	break;
306	178	}
307	19.5k	}
308
309	1.16k	if (path_simplify) {
310	411	if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2;
311	388	if (qs >= 0) {
312	149	buffer_copy_string_len(t, b->ptr+qs, used - qs);
313	149	buffer_truncate(b, qs);
314	149	}
315
316	388	buffer_path_simplify(b);
317
318	388	if (qs >= 0) {
319	149	qs = (int)buffer_clen(b);
320	149	buffer_append_string_len(b, BUF_PTR_LEN(t));
321	149	}
322	388	}
323
324	1.14k	return qs;
325	1.16k	}
326
327
328		__attribute_cold__
329		__attribute_noinline__
330		__attribute_pure__
331	437	static int burl_scan_qmark (const buffer * const b) {
332	437	const char * const qmark = strchr(b->ptr, '?');
333	437	return qmark ? (int)(qmark - b->ptr) : -1;
334	437	}
335
336
337		int burl_normalize (buffer b, buffer t, int flags)
338	1.63k	{
339	1.63k	int qs;
340
341		#if defined(_WIN32) \|\| defined(__CYGWIN__)
342		/* Windows and Cygwin treat '\\' as '/' if '\\' is present in path;
343		* convert to '/' for consistency before percent-encoding
344		* normalization which will convert '\\' to "%5C" in the URL.
345		* (Clients still should not be sending '\\' unencoded in requests.) */
346		if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) {
347		for (char p = b->ptr; p != '?' && *p != '\0'; ++p) {
348		if (p == '\\') p = '/';
349		}
350		}
351		#endif
352
353	1.63k	qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED)
354	1.63k	? burl_normalize_basic_required(b, t)
355	1.63k	: burl_normalize_basic_unreserved(b, t);
356	1.63k	if (-2 == qs) {
357	532	if (flags & HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT) return -2;
358	437	qs = burl_scan_qmark(b);
359	437	}
360
361	1.54k	if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) {
362	345	if (burl_contains_ctrls(b)) return -2;
363	345	}
364
365	1.46k	if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE
366	1.46k	\|HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) {
367	1.22k	qs = burl_normalize_2F_to_slash(b, qs, flags);
368	1.22k	if (-2 == qs) return -2;
369	1.22k	}
370
371	1.45k	if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE
372	1.45k	\|HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) {
373	1.16k	qs = burl_normalize_path(b, t, qs, flags);
374	1.16k	if (-2 == qs) return -2;
375	1.16k	}
376
377	1.43k	if (flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) {
378	875	if (qs >= 0) burl_normalize_qs20_to_plus(b, qs);
379	875	}
380
381	1.43k	return qs;
382	1.45k	}
383
384
385		static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len)
386	0	{
387		/* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
388		* unless already percent-encoded (does not double-encode) */
389		/* Note: not checking for invalid UTF-8 */
390	0	char * const p = buffer_string_prepare_append(b, len*3);
391	0	unsigned int n1, n2;
392	0	int j = 0;
393	0	for (unsigned int i = 0; i < len; ++i, ++j) {
394	0	if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) {
395	0	const unsigned int x = (n1 << 4) \| n2;
396	0	if (burl_is_unreserved((int)x)) {
397	0	p[j] = (char)x;
398	0	}
399	0	else { /* leave UTF-8, control chars, and required chars encoded */
400	0	p[j] = '%';
401	0	p[++j] = str[i+1];
402	0	p[++j] = str[i+2];
403	0	}
404	0	i+=2;
405	0	}
406	0	else if (burl_is_unreserved(str[i])) {
407	0	p[j] = str[i];
408	0	}
409	0	else {
410	0	p[j] = '%';
411	0	p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
412	0	p[++j] = hex_chars_uc[str[i] & 0xF];
413	0	}
414	0	}
415	0	buffer_commit(b, j);
416	0	}
417
418
419		static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len)
420	0	{
421		/* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ plus /
422		* unless already percent-encoded (does not double-encode) */
423		/* Note: not checking for invalid UTF-8 */
424	0	char * const p = buffer_string_prepare_append(b, len*3);
425	0	unsigned int n1, n2;
426	0	int j = 0;
427	0	for (unsigned int i = 0; i < len; ++i, ++j) {
428	0	if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) {
429	0	const unsigned int x = (n1 << 4) \| n2;
430	0	if (burl_is_unreserved((int)x)) {
431	0	p[j] = (char)x;
432	0	}
433	0	else { /* leave UTF-8, control chars, and required chars encoded */
434	0	p[j] = '%';
435	0	p[++j] = str[i+1];
436	0	p[++j] = str[i+2];
437	0	}
438	0	i+=2;
439	0	}
440	0	else if (burl_is_unreserved(str[i]) \|\| str[i] == '/') {
441	0	p[j] = str[i];
442	0	}
443	0	else {
444	0	p[j] = '%';
445	0	p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
446	0	p[++j] = hex_chars_uc[str[i] & 0xF];
447	0	}
448	0	}
449	0	buffer_commit(b, j);
450	0	}
451
452
453		static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len)
454	0	{
455		/* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~
456		* Note: double-encodes any existing '%') */
457		/* Note: not checking for invalid UTF-8 */
458	0	char * const p = buffer_string_prepare_append(b, len*3);
459	0	int j = 0;
460	0	for (unsigned int i = 0; i < len; ++i, ++j) {
461	0	if (burl_is_unreserved(str[i])) {
462	0	p[j] = str[i];
463	0	}
464	0	else {
465	0	p[j] = '%';
466	0	p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF];
467	0	p[++j] = hex_chars_uc[str[i] & 0xF];
468	0	}
469	0	}
470	0	buffer_commit(b, j);
471	0	}
472
473
474		static void burl_offset_tolower (buffer * const b, const size_t off)
475	0	{
476		/(skips over all percent-encodings, including encoding of alpha chars)/
477	0	for (char *p = b->ptr+off; p[0]; ++p) {
478	0	if (light_isupper(p[0])) p[0] \|= 0x20;
479	0	else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
480	0	p+=2;
481	0	}
482	0	}
483
484
485		static void burl_offset_toupper (buffer * const b, const size_t off)
486	0	{
487		/(skips over all percent-encodings, including encoding of alpha chars)/
488	0	for (char *p = b->ptr+off; p[0]; ++p) {
489	0	if (light_islower(p[0])) p[0] &= 0xdf;
490	0	else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2]))
491	0	p+=2;
492	0	}
493	0	}
494
495
496		void burl_append (buffer * const b, const char * const str, const size_t len, const int flags)
497	0	{
498	0	size_t off = 0;
499
500	0	if (0 == len) return;
501
502	0	if (0 == flags) {
503	0	buffer_append_string_len(b, str, len);
504	0	return;
505	0	}
506
507	0	if (flags & (BURL_TOUPPER\|BURL_TOLOWER)) off = buffer_clen(b);
508
509	0	if (flags & BURL_ENCODE_NONE) {
510	0	buffer_append_string_len(b, str, len);
511	0	}
512	0	else if (flags & BURL_ENCODE_ALL) {
513	0	burl_append_encode_all(b, str, len);
514	0	}
515	0	else if (flags & BURL_ENCODE_NDE) {
516	0	burl_append_encode_nde(b, str, len);
517	0	}
518	0	else if (flags & BURL_ENCODE_PSNDE) {
519	0	burl_append_encode_psnde(b, str, len);
520	0	}
521	0	else if (flags & BURL_ENCODE_B64U) {
522	0	const unsigned char s = (const unsigned char )str;
523	0	buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL);
524	0	}
525	0	else if (flags & BURL_DECODE_B64U) {
526	0	buffer_append_base64_decode(b, str, len, BASE64_URL);
527	0	}
528
529		/* note: not normalizing str, which could come from arbitrary header,
530		* so it is possible that alpha chars are percent-encoded upper/lowercase */
531	0	if (flags & (BURL_TOLOWER\|BURL_TOUPPER)) {
532	0	(flags & BURL_TOLOWER)
533	0	? burl_offset_tolower(b, off) /(flags & BURL_TOLOWER)/
534	0	: burl_offset_toupper(b, off); /(flags & BURL_TOUPPER)/
535	0	}
536	0	}