/src/hoextdown/src/escape.c
Line | Count | Source |
1 | | #include "escape.h" |
2 | | |
3 | | #include <assert.h> |
4 | | #include <stdio.h> |
5 | | #include <string.h> |
6 | | |
7 | | |
/*
 * Branch-prediction hints. With GCC-compatible compilers these expand to
 * __builtin_expect; elsewhere they expand to the bare condition, so the
 * code still builds, just without the hint.
 */
#if defined(__GNUC__) || defined(__clang__)
#define likely(x)       __builtin_expect((x),1)
#define unlikely(x)     __builtin_expect((x),0)
#else
#define likely(x)       (x)
#define unlikely(x)     (x)
#endif
10 | | |
11 | | |
/*
 * Lookup table indexed by byte value: 1 means the byte is copied into an
 * href unchanged, 0 means it needs some form of escaping.
 *
 * The following characters will not be escaped:
 *
 *		-_.+!*'(),%#@?=;:/,+&$ alphanum
 *
 * Note that this character set is the addition of:
 *
 *	- The characters which are safe to be in an URL
 *	- The characters which are *not* safe to be in
 *	an URL because they are RESERVED characters.
 *
 * We assume (lazily) that any RESERVED char that
 * appears inside an URL is actually meant to
 * have its native function (i.e. as an URL
 * component/separator) and hence needs no escaping.
 *
 * There are two exceptions: the characters & (amp)
 * and ' (single quote) do not appear in the table.
 * They are meant to appear in the URL as components,
 * yet they require special HTML-entity escaping
 * to generate valid HTML markup.
 *
 * All other characters will be escaped to %XX.
 *
 */
static const uint8_t HREF_SAFE[UINT8_MAX+1] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
55 | | |
56 | | void |
57 | | hoedown_escape_href(hoedown_buffer *ob, const uint8_t *data, size_t size) |
58 | 258k | { |
59 | 258k | static const char hex_chars[] = "0123456789ABCDEF"; |
60 | 258k | size_t i = 0, mark; |
61 | 258k | char hex_str[3]; |
62 | | |
63 | 258k | hex_str[0] = '%'; |
64 | | |
65 | 6.45M | while (i < size) { |
66 | 6.29M | mark = i; |
67 | 19.0M | while (i < size && HREF_SAFE[data[i]]) i++; |
68 | | |
69 | | /* Optimization for cases where there's nothing to escape */ |
70 | 6.29M | if (mark == 0 && i >= size) { |
71 | 32.6k | hoedown_buffer_put(ob, data, size); |
72 | 32.6k | return; |
73 | 32.6k | } |
74 | | |
75 | 6.25M | if (likely(i > mark)) { |
76 | 219k | hoedown_buffer_put(ob, data + mark, i - mark); |
77 | 219k | } |
78 | | |
79 | | /* escaping */ |
80 | 6.25M | if (i >= size) |
81 | 65.0k | break; |
82 | | |
83 | 6.19M | switch (data[i]) { |
84 | | /* amp appears all the time in URLs, but needs |
85 | | * HTML-entity escaping to be inside an href */ |
86 | 10.2k | case '&': |
87 | 10.2k | HOEDOWN_BUFPUTSL(ob, "&"); |
88 | 10.2k | break; |
89 | | |
90 | | /* the single quote is a valid URL character |
91 | | * according to the standard; it needs HTML |
92 | | * entity escaping too */ |
93 | 9.57k | case '\'': |
94 | 9.57k | HOEDOWN_BUFPUTSL(ob, "'"); |
95 | 9.57k | break; |
96 | | |
97 | | /* the space can be escaped to %20 or a plus |
98 | | * sign. we're going with the generic escape |
99 | | * for now. the plus thing is more commonly seen |
100 | | * when building GET strings */ |
101 | | #if 0 |
102 | | case ' ': |
103 | | hoedown_buffer_putc(ob, '+'); |
104 | | break; |
105 | | #endif |
106 | | |
107 | | /* every other character goes with a %XX escaping */ |
108 | 6.17M | default: |
109 | 6.17M | hex_str[1] = hex_chars[(data[i] >> 4) & 0xF]; |
110 | 6.17M | hex_str[2] = hex_chars[data[i] & 0xF]; |
111 | 6.17M | hoedown_buffer_put(ob, (uint8_t *)hex_str, 3); |
112 | 6.19M | } |
113 | | |
114 | 6.19M | i++; |
115 | 6.19M | } |
116 | 258k | } |
117 | | |
118 | | |
/**
 * Lookup table indexed by byte value. A zero entry means the byte needs no
 * HTML escaping; a non-zero entry is the index of its replacement string
 * in HTML_ESCAPES.
 *
 * According to the OWASP rules:
 *
 * & --> &amp;
 * < --> &lt;
 * > --> &gt;
 * " --> &quot;
 * ' --> &#x27;     &apos; is not recommended
 * / --> &#x2F;     forward slash is included as it helps end an HTML entity
 *
 */
static const uint8_t HTML_ESCAPE_TABLE[UINT8_MAX+1] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
148 | | |
/*
 * Replacement strings for HTML escaping, indexed by the non-zero values
 * stored in HTML_ESCAPE_TABLE. Slot 0 is an empty placeholder, because a
 * table value of 0 means "no escaping needed".
 */
static const char *const HTML_ESCAPES[] = {
	"",
	"&quot;",	/* 1: double quote */
	"&amp;",	/* 2: ampersand */
	"&#39;",	/* 3: single quote */
	"&#47;",	/* 4: forward slash */
	"&lt;",		/* 5: less-than */
	"&gt;"		/* 6: greater-than */
};
158 | | |
159 | | void |
160 | | hoedown_escape_html(hoedown_buffer *ob, const uint8_t *data, size_t size, int secure) |
161 | 7.26M | { |
162 | 7.26M | size_t i = 0, mark; |
163 | | |
164 | 30.2M | while (1) { |
165 | 30.2M | mark = i; |
166 | 1.02G | while (i < size && HTML_ESCAPE_TABLE[data[i]] == 0) i++; |
167 | | |
168 | | /* Optimization for cases where there's nothing to escape */ |
169 | 30.2M | if (mark == 0 && i >= size) { |
170 | 6.33M | hoedown_buffer_put(ob, data, size); |
171 | 6.33M | return; |
172 | 6.33M | } |
173 | | |
174 | 23.9M | if (likely(i > mark)) |
175 | 11.7M | hoedown_buffer_put(ob, data + mark, i - mark); |
176 | | |
177 | 23.9M | if (i >= size) break; |
178 | | |
179 | | /* The forward slash is only escaped in secure mode */ |
180 | 22.9M | if (!secure && data[i] == '/') { |
181 | 296k | hoedown_buffer_putc(ob, '/'); |
182 | 22.7M | } else { |
183 | 22.7M | hoedown_buffer_puts(ob, HTML_ESCAPES[HTML_ESCAPE_TABLE[data[i]]]); |
184 | 22.7M | } |
185 | | |
186 | 22.9M | i++; |
187 | 22.9M | } |
188 | 7.26M | } |