/src/fluent-bit/src/flb_unescape.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | |
3 | | /* Fluent Bit |
4 | | * ========== |
5 | | * Copyright (C) 2015-2024 The Fluent Bit Authors |
6 | | * |
7 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | * you may not use this file except in compliance with the License. |
9 | | * You may obtain a copy of the License at |
10 | | * |
11 | | * http://www.apache.org/licenses/LICENSE-2.0 |
12 | | * |
13 | | * Unless required by applicable law or agreed to in writing, software |
14 | | * distributed under the License is distributed on an "AS IS" BASIS, |
15 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | * See the License for the specific language governing permissions and |
17 | | * limitations under the License. |
18 | | */ |
19 | | |
20 | | #include <fluent-bit/flb_compat.h> |
21 | | #include <fluent-bit/flb_info.h> |
22 | | #include <fluent-bit/flb_log.h> |
23 | | |
24 | | #include <stdlib.h> |
25 | | #include <string.h> |
26 | | #include <inttypes.h> |
27 | | |
28 | | static int octal_digit(char c) |
29 | 439k | { |
30 | 439k | return (c >= '0' && c <= '7'); |
31 | 439k | } |
32 | | |
33 | | static int hex_digit(char c) |
34 | 704k | { |
35 | 704k | return ((c >= '0' && c <= '9') || |
36 | 704k | (c >= 'A' && c <= 'F') || |
37 | 704k | (c >= 'a' && c <= 'f')); |
38 | 704k | } |
39 | | |
40 | | static int u8_wc_toutf8(char *dest, uint32_t ch) |
41 | 137M | { |
42 | 137M | if (ch < 0x80) { |
43 | 129M | dest[0] = (char)ch; |
44 | 129M | return 1; |
45 | 129M | } |
46 | 7.47M | if (ch < 0x800) { |
47 | 16.3k | dest[0] = (ch>>6) | 0xC0; |
48 | 16.3k | dest[1] = (ch & 0x3F) | 0x80; |
49 | 16.3k | return 2; |
50 | 16.3k | } |
51 | 7.45M | if (ch < 0x10000) { |
52 | 76.6k | dest[0] = (ch>>12) | 0xE0; |
53 | 76.6k | dest[1] = ((ch>>6) & 0x3F) | 0x80; |
54 | 76.6k | dest[2] = (ch & 0x3F) | 0x80; |
55 | 76.6k | return 3; |
56 | 76.6k | } |
57 | 7.38M | if (ch < 0x110000) { |
58 | 16.9k | dest[0] = (ch>>18) | 0xF0; |
59 | 16.9k | dest[1] = ((ch>>12) & 0x3F) | 0x80; |
60 | 16.9k | dest[2] = ((ch>>6) & 0x3F) | 0x80; |
61 | 16.9k | dest[3] = (ch & 0x3F) | 0x80; |
62 | 16.9k | return 4; |
63 | 16.9k | } |
64 | 7.36M | return 0; |
65 | 7.38M | } |
66 | | |
67 | 93.6k | static int u8_high_surrogate(uint32_t ch) { |
68 | 93.6k | return ch >= 0xD800 && ch <= 0xDBFF; |
69 | 93.6k | } |
70 | | |
71 | 123k | static int u8_low_surrogate(uint32_t ch) { |
72 | 123k | return ch >= 0xDC00 && ch <= 0xDFFF; |
73 | 123k | } |
74 | | |
75 | 13.4k | static uint32_t u8_combine_surrogates(uint32_t high, uint32_t low) { |
76 | 13.4k | return 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00)); |
77 | 13.4k | } |
78 | | |
79 | | /* assumes that src points to the character after a backslash |
80 | | returns number of input characters processed */ |
81 | | static int u8_read_escape_sequence(const char *str, int size, uint32_t *dest) |
82 | 437k | { |
83 | 437k | uint32_t ch = 0; |
84 | 437k | char digs[9]="\0\0\0\0\0\0\0\0"; |
85 | 437k | char ldigs[9]="\0\0\0\0\0\0\0\0"; |
86 | 437k | int dno=0, i=1; |
87 | 437k | uint32_t low = 0; |
88 | | |
89 | 437k | ch = (uint32_t)str[0]; /* take literal character */ |
90 | | |
91 | 437k | if (str[0] == 'n') |
92 | 0 | ch = L'\n'; |
93 | 437k | else if (str[0] == 't') |
94 | 0 | ch = L'\t'; |
95 | 437k | else if (str[0] == 'r') |
96 | 0 | ch = L'\r'; |
97 | 437k | else if (str[0] == 'b') |
98 | 0 | ch = L'\b'; |
99 | 437k | else if (str[0] == 'f') |
100 | 0 | ch = L'\f'; |
101 | 437k | else if (str[0] == 'v') |
102 | 1.81k | ch = L'\v'; |
103 | 435k | else if (str[0] == 'a') |
104 | 6.85k | ch = L'\a'; |
105 | 428k | else if (octal_digit(str[0])) { |
106 | 28.7k | i = 0; |
107 | 34.1k | do { |
108 | 34.1k | digs[dno++] = str[i++]; |
109 | 34.1k | } while (i < size && octal_digit(str[i]) && dno < 3); |
110 | 28.7k | ch = strtol(digs, NULL, 8); |
111 | 28.7k | } |
112 | 399k | else if (str[0] == 'x') { |
113 | 50.5k | while (i < size && hex_digit(str[i]) && dno < 2) { |
114 | 29.4k | digs[dno++] = str[i++]; |
115 | 29.4k | } |
116 | 21.0k | if (dno > 0) { |
117 | 16.7k | ch = strtol(digs, NULL, 16); |
118 | 16.7k | } |
119 | 21.0k | } |
120 | 378k | else if (str[0] == 'u') { |
121 | 494k | while (i < size && hex_digit(str[i]) && dno < 4) { |
122 | 389k | digs[dno++] = str[i++]; |
123 | 389k | } |
124 | 104k | if (dno != 4) { |
125 | | /* Incomplete \u escape sequence */ |
126 | 9.62k | if (dno > 0) { |
127 | 5.19k | ch = L'\uFFFD'; |
128 | 5.19k | goto invalid_sequence; |
129 | 5.19k | } |
130 | 9.62k | } |
131 | 99.4k | ch = strtol(digs, NULL, 16); |
132 | 99.4k | if (u8_low_surrogate(ch)) { |
133 | | /* Invalid: low surrogate without preceding high surrogate */ |
134 | 5.79k | ch = L'\uFFFD'; |
135 | 5.79k | goto invalid_sequence; |
136 | 5.79k | } |
137 | 93.6k | else if (u8_high_surrogate(ch)) { |
138 | | /* Handle a surrogate pair. |
139 | | * Note that i is already incremented with 4 here. */ |
140 | 56.1k | if (i + 2 < size && str[i] == '\\' && str[i + 1] == 'u') { |
141 | 28.5k | dno = 0; |
142 | 28.5k | i += 2; /* Skip "\u" */ |
143 | 122k | while (i < size && hex_digit(str[i]) && dno < 4) { |
144 | 94.3k | ldigs[dno++] = str[i++]; |
145 | 94.3k | } |
146 | 28.5k | if (dno != 4) { |
147 | | /* Incomplete low surrogate */ |
148 | 7.39k | if (dno > 0) { |
149 | 4.52k | ch = L'\uFFFD'; |
150 | 4.52k | goto invalid_sequence; |
151 | 4.52k | } |
152 | 7.39k | } |
153 | 24.0k | low = strtol(ldigs, NULL, 16); |
154 | 24.0k | if (u8_low_surrogate(low)) { |
155 | 13.4k | ch = u8_combine_surrogates(ch, low); |
156 | 13.4k | } |
157 | 10.5k | else { |
158 | | /* Invalid: high surrogate not followed by low surrogate */ |
159 | 10.5k | ch = L'\uFFFD'; |
160 | 10.5k | goto invalid_sequence; |
161 | 10.5k | } |
162 | 24.0k | } |
163 | 27.6k | else { |
164 | | /* Invalid: high surrogate not followed by \u */ |
165 | 27.6k | ch = L'\uFFFD'; |
166 | 27.6k | goto invalid_sequence; |
167 | 27.6k | } |
168 | 56.1k | } |
169 | 99.4k | } |
170 | 274k | else if (str[0] == 'U') { |
171 | 286k | while (i < size && hex_digit(str[i]) && dno < 8) { |
172 | 38.7k | digs[dno++] = str[i++]; |
173 | 38.7k | } |
174 | 248k | if (dno > 0) { |
175 | 8.80k | ch = strtol(digs, NULL, 16); |
176 | 8.80k | } |
177 | 248k | } |
178 | | |
179 | 437k | invalid_sequence: |
180 | | |
181 | 437k | *dest = ch; |
182 | | |
183 | 437k | return i; |
184 | 437k | } |
185 | | |
186 | | int flb_unescape_string_utf8(const char *in_buf, int sz, char *out_buf) |
187 | 7.79M | { |
188 | 7.79M | uint32_t ch; |
189 | 7.79M | char temp[4]; |
190 | 7.79M | const char *end; |
191 | 7.79M | const char *next; |
192 | 7.79M | int size; |
193 | | |
194 | | |
195 | 7.79M | int count_out = 0; |
196 | 7.79M | int count_in = 0; |
197 | 7.79M | int esc_in = 0; |
198 | 7.79M | int esc_out = 0; |
199 | | |
200 | 7.79M | end = in_buf + sz; |
201 | 144M | while (in_buf < end && *in_buf && count_in < sz) { |
202 | 137M | next = in_buf + 1; |
203 | 137M | if (next < end && *in_buf == '\\') { |
204 | 1.21M | esc_in = 2; |
205 | 1.21M | switch (*next) { |
206 | 142k | case '"': |
207 | 142k | ch = '"'; |
208 | 142k | break; |
209 | 1.36k | case '\'': |
210 | 1.36k | ch = '\''; |
211 | 1.36k | break; |
212 | 570k | case '\\': |
213 | 570k | ch = '\\'; |
214 | 570k | break; |
215 | 9.30k | case '/': |
216 | 9.30k | ch = '/'; |
217 | 9.30k | break; |
218 | 7.25k | case 'n': |
219 | 7.25k | ch = '\n'; |
220 | 7.25k | break; |
221 | 14.9k | case 'b': |
222 | 14.9k | ch = '\b'; |
223 | 14.9k | break; |
224 | 9.34k | case 't': |
225 | 9.34k | ch = '\t'; |
226 | 9.34k | break; |
227 | 13.7k | case 'f': |
228 | 13.7k | ch = '\f'; |
229 | 13.7k | break; |
230 | 8.11k | case 'r': |
231 | 8.11k | ch = '\r'; |
232 | 8.11k | break; |
233 | 437k | default: |
234 | 437k | size = end - next; |
235 | 437k | if (size > 0) { |
236 | 437k | esc_in = u8_read_escape_sequence(next, size, &ch) + 1; |
237 | 437k | } |
238 | 0 | else { |
239 | | /* because char is unsigned char by default on arm, so we need to do a explicit conversion */ |
240 | 0 | ch = (uint32_t) (signed char) *in_buf; |
241 | 0 | esc_in = 1; |
242 | 0 | } |
243 | 1.21M | } |
244 | 1.21M | } |
245 | 135M | else { |
246 | | /* explicit convert char to signed char */ |
247 | 135M | ch = (uint32_t) (signed char) *in_buf; |
248 | 135M | esc_in = 1; |
249 | 135M | } |
250 | | |
251 | 137M | in_buf += esc_in; |
252 | 137M | count_in += esc_in; |
253 | | |
254 | 137M | esc_out = u8_wc_toutf8(temp, ch); |
255 | 137M | if (esc_out > sz-count_out) { |
256 | 0 | flb_error("Crossing over string boundary"); |
257 | 0 | break; |
258 | 0 | } |
259 | | |
260 | 137M | if (esc_out == 0) { |
261 | 7.36M | out_buf[count_out] = ch; |
262 | 7.36M | esc_out = 1; |
263 | 7.36M | } |
264 | 129M | else if (esc_out == 1) { |
265 | 129M | out_buf[count_out] = (char) temp[0]; |
266 | 129M | } |
267 | 109k | else { |
268 | 109k | memcpy(&out_buf[count_out], temp, esc_out); |
269 | 109k | } |
270 | 137M | count_out += esc_out; |
271 | 137M | } |
272 | 7.79M | if (count_in < sz) { |
273 | 1.95k | flb_error("Not at boundary but still NULL terminating : %d - '%s'", sz, in_buf); |
274 | 1.95k | } |
275 | 7.79M | out_buf[count_out] = '\0'; |
276 | 7.79M | return count_out; |
277 | 7.79M | } |
278 | | |
279 | | int flb_unescape_string(const char *buf, int buf_len, char **unesc_buf) |
280 | 13.5k | { |
281 | 13.5k | int i = 0; |
282 | 13.5k | int j = 0; |
283 | 13.5k | char *p; |
284 | 13.5k | char n; |
285 | | |
286 | 13.5k | p = *unesc_buf; |
287 | 3.62M | while (i < buf_len) { |
288 | 3.60M | if (buf[i] == '\\') { |
289 | 35.9k | if (i + 1 < buf_len) { |
290 | 34.9k | n = buf[i + 1]; |
291 | 34.9k | if (n == 'n') { |
292 | 780 | p[j++] = '\n'; |
293 | 780 | i++; |
294 | 780 | } |
295 | 34.1k | else if (n == 'a') { |
296 | 622 | p[j++] = '\a'; |
297 | 622 | i++; |
298 | 622 | } |
299 | 33.5k | else if (n == 'b') { |
300 | 956 | p[j++] = '\b'; |
301 | 956 | i++; |
302 | 956 | } |
303 | 32.6k | else if (n == 't') { |
304 | 4.47k | p[j++] = '\t'; |
305 | 4.47k | i++; |
306 | 4.47k | } |
307 | 28.1k | else if (n == 'v') { |
308 | 467 | p[j++] = '\v'; |
309 | 467 | i++; |
310 | 467 | } |
311 | 27.6k | else if (n == 'f') { |
312 | 932 | p[j++] = '\f'; |
313 | 932 | i++; |
314 | 932 | } |
315 | 26.7k | else if (n == 'r') { |
316 | 1.07k | p[j++] = '\r'; |
317 | 1.07k | i++; |
318 | 1.07k | } |
319 | 25.6k | else if (n == '\\') { |
320 | 5.79k | p[j++] = '\\'; |
321 | 5.79k | i++; |
322 | 5.79k | } |
323 | 34.9k | i++; |
324 | 34.9k | continue; |
325 | 34.9k | } |
326 | 990 | else { |
327 | 990 | i++; |
328 | 990 | } |
329 | 35.9k | } |
330 | 3.57M | p[j++] = buf[i++]; |
331 | 3.57M | } |
332 | 13.5k | p[j] = '\0'; |
333 | 13.5k | return j; |
334 | 13.5k | } |
335 | | |
336 | | |
337 | | /* mysql unquote */ |
338 | | int flb_mysql_unquote_string(char *buf, int buf_len, char **unesc_buf) |
339 | 1.85k | { |
340 | 1.85k | int i = 0; |
341 | 1.85k | int j = 0; |
342 | 1.85k | char *p; |
343 | 1.85k | char n; |
344 | | |
345 | 1.85k | p = *unesc_buf; |
346 | 891k | while (i < buf_len) { |
347 | 889k | if ((n = buf[i++]) != '\\') { |
348 | 776k | p[j++] = n; |
349 | 776k | } else if(i >= buf_len) { |
350 | 597 | p[j++] = n; |
351 | 112k | } else { |
352 | 112k | n = buf[i++]; |
353 | 112k | switch(n) { |
354 | 1.33k | case 'n': |
355 | 1.33k | p[j++] = '\n'; |
356 | 1.33k | break; |
357 | 6.51k | case 'r': |
358 | 6.51k | p[j++] = '\r'; |
359 | 6.51k | break; |
360 | 747 | case 't': |
361 | 747 | p[j++] = '\t'; |
362 | 747 | break; |
363 | 3.24k | case '\\': |
364 | 3.24k | p[j++] = '\\'; |
365 | 3.24k | break; |
366 | 1.13k | case '\'': |
367 | 1.13k | p[j++] = '\''; |
368 | 1.13k | break; |
369 | 92.9k | case '\"': |
370 | 92.9k | p[j++] = '\"'; |
371 | 92.9k | break; |
372 | 2.33k | case '0': |
373 | 2.33k | p[j++] = 0; |
374 | 2.33k | break; |
375 | 684 | case 'Z': |
376 | 684 | p[j++] = 0x1a; |
377 | 684 | break; |
378 | 3.57k | default: |
379 | 3.57k | p[j++] = '\\'; |
380 | 3.57k | p[j++] = n; |
381 | 3.57k | break; |
382 | 112k | } |
383 | 112k | } |
384 | 889k | } |
385 | 1.85k | p[j] = '\0'; |
386 | 1.85k | return j; |
387 | 1.85k | } |